Ubernetes Lite: Volumes can dictate zone scheduling

For AWS EBS, a volume can only be attached to a node in the same AZ.
The scheduler must therefore detect if a volume is being attached to a
pod, and ensure that the pod is scheduled on a node in the same AZ as
the volume.

So that the scheduler need not query the cloud provider every time, and
to support decoupled operation (e.g. bare metal) we tag the volume with
our placement labels.  This is done automatically by means of an
admission controller on AWS when a PersistentVolume is created backed by
an EBS volume.

Support for tagging GCE PVs will follow.

Pods that specify a volume directly (i.e. without using a
PersistentVolumeClaim) will not currently be scheduled correctly (i.e.
they will be scheduled without zone-awareness).
This commit is contained in:
Justin Santa Barbara
2015-11-29 14:00:49 -05:00
parent 7743a4ca89
commit f9a6ac077e
15 changed files with 536 additions and 4 deletions

View File

@@ -0,0 +1,131 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package label
import (
"fmt"
"io"
"sync"
"k8s.io/kubernetes/pkg/admission"
"k8s.io/kubernetes/pkg/api"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/cloudprovider"
"k8s.io/kubernetes/pkg/cloudprovider/providers/aws"
)
func init() {
admission.RegisterPlugin("PersistentVolumeLabel", func(client client.Interface, config io.Reader) (admission.Interface, error) {
persistentVolumeLabelAdmission := NewPersistentVolumeLabel()
return persistentVolumeLabelAdmission, nil
})
}
var _ = admission.Interface(&persistentVolumeLabel{})
type persistentVolumeLabel struct {
*admission.Handler
mutex sync.Mutex
ebsVolumes aws.Volumes
}
// NewPersistentVolumeLabel returns an admission.Interface implementation which adds labels to PersistentVolume CREATE requests,
// based on the labels provided by the underlying cloud provider.
//
// As a side effect, the cloud provider may block invalid or non-existent volumes.
func NewPersistentVolumeLabel() *persistentVolumeLabel {
return &persistentVolumeLabel{
Handler: admission.NewHandler(admission.Create),
}
}
func (l *persistentVolumeLabel) Admit(a admission.Attributes) (err error) {
if a.GetResource() != api.Resource("persistentvolumes") {
return nil
}
obj := a.GetObject()
if obj == nil {
return nil
}
volume, ok := obj.(*api.PersistentVolume)
if !ok {
return nil
}
var volumeLabels map[string]string
if volume.Spec.AWSElasticBlockStore != nil {
labels, err := l.findAWSEBSLabels(volume)
if err != nil {
return admission.NewForbidden(a, fmt.Errorf("error querying AWS EBS volume %s: %v", volume.Spec.AWSElasticBlockStore.VolumeID, err))
}
volumeLabels = labels
}
if len(volumeLabels) != 0 {
if volume.Labels == nil {
volume.Labels = make(map[string]string)
}
for k, v := range volumeLabels {
// We (silently) replace labels if they are provided.
// This should be OK because they are in the kubernetes.io namespace
// i.e. we own them
volume.Labels[k] = v
}
}
return nil
}
func (l *persistentVolumeLabel) findAWSEBSLabels(volume *api.PersistentVolume) (map[string]string, error) {
ebsVolumes, err := l.getEBSVolumes()
if err != nil {
return nil, err
}
if ebsVolumes == nil {
return nil, fmt.Errorf("unable to build AWS cloud provider for EBS")
}
// TODO: GetVolumeLabels is actually a method on the Volumes interface
// If that gets standardized we can refactor to reduce code duplication
labels, err := ebsVolumes.GetVolumeLabels(volume.Spec.AWSElasticBlockStore.VolumeID)
if err != nil {
return nil, err
}
return labels, err
}
// getEBSVolumes returns the AWS Volumes interface for ebs
func (l *persistentVolumeLabel) getEBSVolumes() (aws.Volumes, error) {
l.mutex.Lock()
defer l.mutex.Unlock()
if l.ebsVolumes == nil {
cloudProvider, err := cloudprovider.GetCloudProvider("aws", nil)
if err != nil || cloudProvider == nil {
return nil, err
}
awsCloudProvider, ok := cloudProvider.(*aws.AWSCloud)
if !ok {
// GetCloudProvider has gone very wrong
return nil, fmt.Errorf("error retrieving AWS cloud provider")
}
l.ebsVolumes = awsCloudProvider
}
return l.ebsVolumes, nil
}

View File

@@ -0,0 +1,154 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package label
import (
"testing"
"fmt"
"k8s.io/kubernetes/pkg/admission"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/cloudprovider/providers/aws"
)
type mockVolumes struct {
volumeLabels map[string]string
volumeLabelsError error
}
var _ aws.Volumes = &mockVolumes{}
func (v *mockVolumes) AttachDisk(instanceName string, volumeName string, readOnly bool) (string, error) {
return "", fmt.Errorf("not implemented")
}
func (v *mockVolumes) DetachDisk(instanceName string, volumeName string) error {
return fmt.Errorf("not implemented")
}
func (v *mockVolumes) CreateVolume(volumeOptions *aws.VolumeOptions) (volumeName string, err error) {
return "", fmt.Errorf("not implemented")
}
func (v *mockVolumes) DeleteVolume(volumeName string) error {
return fmt.Errorf("not implemented")
}
func (v *mockVolumes) GetVolumeLabels(volumeName string) (map[string]string, error) {
return v.volumeLabels, v.volumeLabelsError
}
func mockVolumeFailure(err error) *mockVolumes {
return &mockVolumes{volumeLabelsError: err}
}
func mockVolumeLabels(labels map[string]string) *mockVolumes {
return &mockVolumes{volumeLabels: labels}
}
// TestAdmission
func TestAdmission(t *testing.T) {
pvHandler := NewPersistentVolumeLabel()
handler := admission.NewChainHandler(pvHandler)
ignoredPV := api.PersistentVolume{
ObjectMeta: api.ObjectMeta{Name: "noncloud", Namespace: "myns"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
HostPath: &api.HostPathVolumeSource{
Path: "/",
},
},
},
}
awsPV := api.PersistentVolume{
ObjectMeta: api.ObjectMeta{Name: "noncloud", Namespace: "myns"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
AWSElasticBlockStore: &api.AWSElasticBlockStoreVolumeSource{
VolumeID: "123",
},
},
},
}
// Non-cloud PVs are ignored
err := handler.Admit(admission.NewAttributesRecord(&ignoredPV, api.Kind("PersistentVolume"), ignoredPV.Namespace, ignoredPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err != nil {
t.Errorf("Unexpected error returned from admission handler (on ignored pv): %v", err)
}
// We only add labels on creation
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Delete, nil))
if err != nil {
t.Errorf("Unexpected error returned from admission handler (when deleting aws pv): %v", err)
}
// Errors from the cloudprovider block creation of the volume
pvHandler.ebsVolumes = mockVolumeFailure(fmt.Errorf("invalid volume"))
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err == nil {
t.Errorf("Expected error when aws pv info fails")
}
// Don't add labels if the cloudprovider doesn't return any
labels := make(map[string]string)
pvHandler.ebsVolumes = mockVolumeLabels(labels)
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err != nil {
t.Errorf("Expected no error when creating aws pv")
}
if len(awsPV.ObjectMeta.Labels) != 0 {
t.Errorf("Unexpected number of labels")
}
// Don't panic if the cloudprovider returns nil, nil
pvHandler.ebsVolumes = mockVolumeFailure(nil)
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err != nil {
t.Errorf("Expected no error when cloud provider returns empty labels")
}
// Labels from the cloudprovider should be applied to the volume
labels = make(map[string]string)
labels["a"] = "1"
labels["b"] = "2"
pvHandler.ebsVolumes = mockVolumeLabels(labels)
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err != nil {
t.Errorf("Expected no error when creating aws pv")
}
if awsPV.Labels["a"] != "1" || awsPV.Labels["b"] != "2" {
t.Errorf("Expected label a to be added when creating aws pv")
}
// User-provided labels should be honored, but cloudprovider labels replace them when they overlap
awsPV.ObjectMeta.Labels = make(map[string]string)
awsPV.ObjectMeta.Labels["a"] = "not1"
awsPV.ObjectMeta.Labels["c"] = "3"
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err != nil {
t.Errorf("Expected no error when creating aws pv")
}
if awsPV.Labels["a"] != "1" || awsPV.Labels["b"] != "2" {
t.Errorf("Expected cloudprovider labels to replace user labels when creating aws pv")
}
if awsPV.Labels["c"] != "3" {
t.Errorf("Expected (non-conflicting) user provided labels to be honored when creating aws pv")
}
}

View File

@@ -0,0 +1,19 @@
/*
Copyright 2014 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// labels created persistent volumes with zone information
// as provided by the cloud provider
package label

View File

@@ -26,12 +26,21 @@ import (
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api/unversioned"
)
type NodeInfo interface {
GetNodeInfo(nodeID string) (*api.Node, error)
}
type PersistentVolumeInfo interface {
GetPersistentVolumeInfo(pvID string) (*api.PersistentVolume, error)
}
type PersistentVolumeClaimInfo interface {
GetPersistentVolumeClaimInfo(namespace string, pvcID string) (*api.PersistentVolumeClaim, error)
}
type StaticNodeInfo struct {
*api.NodeList
}
@@ -136,6 +145,108 @@ func NoDiskConflict(pod *api.Pod, existingPods []*api.Pod, node string) (bool, e
return true, nil
}
type VolumeZoneChecker struct {
nodeInfo NodeInfo
pvInfo PersistentVolumeInfo
pvcInfo PersistentVolumeClaimInfo
}
// VolumeZonePredicate evaluates if a pod can fit due to the volumes it requests, given
// that some volumes may have zone scheduling constraints. The requirement is that any
// volume zone-labels must match the equivalent zone-labels on the node. It is OK for
// the node to have more zone-label constraints (for example, a hypothetical replicated
// volume might allow region-wide access)
//
// Currently this is only supported with PersistentVolumeClaims, and looks to the labels
// only on the bound PersistentVolume.
//
// Working with volumes declared inline in the pod specification (i.e. not
// using a PersistentVolume) is likely to be harder, as it would require
// determining the zone of a volume during scheduling, and that is likely to
// require calling out to the cloud provider. It seems that we are moving away
// from inline volume declarations anyway.
func NewVolumeZonePredicate(nodeInfo NodeInfo, pvInfo PersistentVolumeInfo, pvcInfo PersistentVolumeClaimInfo) algorithm.FitPredicate {
c := &VolumeZoneChecker{
nodeInfo: nodeInfo,
pvInfo: pvInfo,
pvcInfo: pvcInfo,
}
return c.predicate
}
func (c *VolumeZoneChecker) predicate(pod *api.Pod, existingPods []*api.Pod, nodeID string) (bool, error) {
node, err := c.nodeInfo.GetNodeInfo(nodeID)
if err != nil {
return false, err
}
if node == nil {
return false, fmt.Errorf("node not found: %q", nodeID)
}
nodeConstraints := make(map[string]string)
for k, v := range node.ObjectMeta.Labels {
if k != unversioned.LabelZoneFailureDomain && k != unversioned.LabelZoneRegion {
continue
}
nodeConstraints[k] = v
}
if len(nodeConstraints) == 0 {
// The node has no zone constraints, so we're OK to schedule.
// In practice, when using zones, all nodes must be labeled with zone labels.
// We want to fast-path this case though.
return true, nil
}
namespace := pod.Namespace
manifest := &(pod.Spec)
for i := range manifest.Volumes {
volume := &manifest.Volumes[i]
if volume.PersistentVolumeClaim != nil {
pvcName := volume.PersistentVolumeClaim.ClaimName
if pvcName == "" {
return false, fmt.Errorf("PersistentVolumeClaim had no name: %q", pvcName)
}
pvc, err := c.pvcInfo.GetPersistentVolumeClaimInfo(namespace, pvcName)
if err != nil {
return false, err
}
if pvc == nil {
return false, fmt.Errorf("PersistentVolumeClaim was not found: %q", pvcName)
}
pvName := pvc.Spec.VolumeName
if pvName == "" {
return false, fmt.Errorf("PersistentVolumeClaim is not bound: %q", pvcName)
}
pv, err := c.pvInfo.GetPersistentVolumeInfo(pvName)
if err != nil {
return false, err
}
if pv == nil {
return false, fmt.Errorf("PersistentVolume not found: %q", pvName)
}
for k, v := range pv.ObjectMeta.Labels {
if k != unversioned.LabelZoneFailureDomain && k != unversioned.LabelZoneRegion {
continue
}
nodeV, _ := nodeConstraints[k]
if v != nodeV {
glog.V(2).Infof("Won't schedule pod %q onto node %q due to volume %q (mismatch on %q)", pod.Name, nodeID, pvName, k)
return false, nil
}
}
}
}
return true, nil
}
type ResourceFit struct {
info NodeInfo
}

View File

@@ -64,6 +64,13 @@ func defaultPredicates() sets.String {
),
// Fit is determined by non-conflicting disk volumes.
factory.RegisterFitPredicate("NoDiskConflict", predicates.NoDiskConflict),
// Fit is determined by volume zone requirements.
factory.RegisterFitPredicateFactory(
"NoVolumeZoneConflict",
func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
return predicates.NewVolumeZonePredicate(args.NodeInfo, args.PVInfo, args.PVCInfo)
},
),
// Fit is determined by node selector query.
factory.RegisterFitPredicateFactory(
"MatchNodeSelector",

View File

@@ -58,6 +58,10 @@ type ConfigFactory struct {
PodLister algorithm.PodLister
// a means to list all nodes
NodeLister *cache.StoreToNodeLister
// a means to list all PersistentVolumes
PVLister *cache.StoreToPVFetcher
// a means to list all PersistentVolumeClaims
PVCLister *cache.StoreToPVCFetcher
// a means to list all services
ServiceLister *cache.StoreToServiceLister
// a means to list all controllers
@@ -85,6 +89,8 @@ func NewConfigFactory(client *client.Client, rateLimiter util.RateLimiter, sched
ScheduledPodLister: &cache.StoreToPodLister{},
// Only nodes in the "Ready" condition with status == "True" are schedulable
NodeLister: &cache.StoreToNodeLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
PVLister: &cache.StoreToPVFetcher{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
PVCLister: &cache.StoreToPVCFetcher{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
ServiceLister: &cache.StoreToServiceLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
ControllerLister: &cache.StoreToReplicationControllerLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
StopEverything: make(chan struct{}),
@@ -188,6 +194,8 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
// All fit predicates only need to consider schedulable nodes.
NodeLister: f.NodeLister.NodeCondition(getNodeConditionPredicate()),
NodeInfo: &predicates.CachedNodeInfo{f.NodeLister},
PVInfo: f.PVLister,
PVCInfo: f.PVCLister,
}
predicateFuncs, err := getFitPredicateFunctions(predicateKeys, pluginArgs)
if err != nil {
@@ -209,6 +217,11 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
// Nodes may be listed frequently, so provide a local up-to-date cache.
cache.NewReflector(f.createNodeLW(), &api.Node{}, f.NodeLister.Store, 0).RunUntil(f.StopEverything)
// Watch PVs & PVCs
// They may be listed frequently for scheduling constraints, so provide a local up-to-date cache.
cache.NewReflector(f.createPersistentVolumeLW(), &api.PersistentVolume{}, f.PVLister.Store, 0).RunUntil(f.StopEverything)
cache.NewReflector(f.createPersistentVolumeClaimLW(), &api.PersistentVolumeClaim{}, f.PVCLister.Store, 0).RunUntil(f.StopEverything)
// Watch and cache all service objects. Scheduler needs to find all pods
// created by the same services or ReplicationControllers, so that it can spread them correctly.
// Cache this locally.
@@ -303,6 +316,16 @@ func (factory *ConfigFactory) createNodeLW() *cache.ListWatch {
return cache.NewListWatchFromClient(factory.Client, "nodes", api.NamespaceAll, fields)
}
// createPersistentVolumeLW returns a cache.ListWatch that gets all changes to persistentVolumes.
func (factory *ConfigFactory) createPersistentVolumeLW() *cache.ListWatch {
return cache.NewListWatchFromClient(factory.Client, "persistentVolumes", api.NamespaceAll, fields.ParseSelectorOrDie(""))
}
// createPersistentVolumeClaimLW returns a cache.ListWatch that gets all changes to persistentVolumeClaims.
func (factory *ConfigFactory) createPersistentVolumeClaimLW() *cache.ListWatch {
return cache.NewListWatchFromClient(factory.Client, "persistentVolumeClaims", api.NamespaceAll, fields.ParseSelectorOrDie(""))
}
// Returns a cache.ListWatch that gets all changes to services.
func (factory *ConfigFactory) createServiceLW() *cache.ListWatch {
return cache.NewListWatchFromClient(factory.Client, "services", api.NamespaceAll, fields.ParseSelectorOrDie(""))

View File

@@ -38,6 +38,8 @@ type PluginFactoryArgs struct {
algorithm.ControllerLister
NodeLister algorithm.NodeLister
NodeInfo predicates.NodeInfo
PVInfo predicates.PersistentVolumeInfo
PVCInfo predicates.PersistentVolumeClaimInfo
}
// A FitPredicateFactory produces a FitPredicate from the given args.