Merge pull request #42116 from vishh/gpu-experimental-support
Automatic merge from submit-queue.

**Extend experimental support to multiple Nvidia GPUs** (extended from #28216)

```release-note
The `--experimental-nvidia-gpus` flag is **replaced** by the `Accelerators` alpha feature gate, along with support for multiple Nvidia GPUs. To use GPUs, pass `Accelerators=true` as part of the `--feature-gates` flag. Works only with the Docker runtime.
```

1. Automated testing for this PR is not possible, since creating clusters with GPUs isn't supported in GCP yet.
2. To test this PR locally, use the node e2e:

```shell
TEST_ARGS='--feature-gates=DynamicKubeletConfig=true' FOCUS=GPU SKIP="" make test-e2e-node
```

TODO:
- [x] Run manual tests
- [x] Add node e2e
- [x] Add unit tests for GPU manager (< 100% coverage)
- [ ] Add unit tests in kubelet package
Commit ed479163fa
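Before the diff, a bit of context on how the new code path gets exercised: the only signal the GPU manager reads is the alpha Nvidia GPU resource in a container's limits. A minimal Go sketch (the helper name and values are illustrative; `v1.ResourceNvidiaGPU` and the limits-only convention come from the code below):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/kubernetes/pkg/api/v1"
)

// gpuContainer is an illustrative helper: it builds a container spec that
// requests `gpus` Nvidia GPUs through its resource limits, which is where the
// new GPU manager looks for GPU requests.
func gpuContainer(name string, gpus int64) v1.Container {
	return v1.Container{
		Name: name,
		Resources: v1.ResourceRequirements{
			Limits: v1.ResourceList{
				v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
			},
		},
	}
}

func main() {
	c := gpuContainer("cuda-job", 2)
	fmt.Println(c.Resources.Limits.NvidiaGPU().Value()) // prints 2
}
```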
@@ -206,7 +206,6 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.BoolVar(&s.BabysitDaemons, "babysit-daemons", s.BabysitDaemons, "If true, the node has babysitter process monitoring docker and kubelet.")
fs.MarkDeprecated("babysit-daemons", "Will be removed in a future version.")
fs.Int32Var(&s.MaxPods, "max-pods", s.MaxPods, "Number of Pods that can run on this Kubelet.")
fs.Int32Var(&s.NvidiaGPUs, "experimental-nvidia-gpus", s.NvidiaGPUs, "Number of NVIDIA GPU devices on this node. Only 0 (default) and 1 are currently supported.")
// TODO(#40229): Remove the docker-exec-handler flag.
fs.StringVar(&s.DockerExecHandlerName, "docker-exec-handler", s.DockerExecHandlerName, "Handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.")
fs.MarkDeprecated("docker-exec-handler", "this flag will be removed and only the 'native' handler will be supported in the future.")
@@ -690,3 +690,4 @@ windows-line-endings
www-prefix
zone-id
zone-name
@ -362,8 +362,6 @@ type KubeletConfiguration struct {
|
||||
BabysitDaemons bool
|
||||
// maxPods is the number of pods that can run on this Kubelet.
|
||||
MaxPods int32
|
||||
// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
|
||||
NvidiaGPUs int32
|
||||
// dockerExecHandlerName is the handler to use when executing a command
|
||||
// in a container. Valid values are 'native' and 'nsenter'. Defaults to
|
||||
// 'native'.
|
||||
|
@ -407,8 +407,6 @@ type KubeletConfiguration struct {
|
||||
BabysitDaemons bool `json:"babysitDaemons"`
|
||||
// maxPods is the number of pods that can run on this Kubelet.
|
||||
MaxPods int32 `json:"maxPods"`
|
||||
// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
|
||||
NvidiaGPUs int32 `json:"nvidiaGPUs"`
|
||||
// dockerExecHandlerName is the handler to use when executing a command
|
||||
// in a container. Valid values are 'native' and 'nsenter'. Defaults to
|
||||
// 'native'.
|
||||
|
@ -353,7 +353,6 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu
|
||||
out.HairpinMode = in.HairpinMode
|
||||
out.BabysitDaemons = in.BabysitDaemons
|
||||
out.MaxPods = in.MaxPods
|
||||
out.NvidiaGPUs = in.NvidiaGPUs
|
||||
out.DockerExecHandlerName = in.DockerExecHandlerName
|
||||
out.PodCIDR = in.PodCIDR
|
||||
out.ResolverConfig = in.ResolverConfig
|
||||
@ -531,7 +530,6 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu
|
||||
out.HairpinMode = in.HairpinMode
|
||||
out.BabysitDaemons = in.BabysitDaemons
|
||||
out.MaxPods = in.MaxPods
|
||||
out.NvidiaGPUs = in.NvidiaGPUs
|
||||
out.DockerExecHandlerName = in.DockerExecHandlerName
|
||||
out.PodCIDR = in.PodCIDR
|
||||
out.ResolverConfig = in.ResolverConfig
|
||||
|
@ -73,6 +73,14 @@ const (
|
||||
// Determines if affinity defined in annotations should be processed
|
||||
// TODO: remove when alpha support for affinity is removed
|
||||
AffinityInAnnotations utilfeature.Feature = "AffinityInAnnotations"
|
||||
|
||||
// owner: @vishh
|
||||
// alpha: v1.6
|
||||
//
|
||||
// Enables support for GPUs as a schedulable resource.
|
||||
// Only Nvidia GPUs are supported as of v1.6.
|
||||
// Works only with Docker Container Runtime.
|
||||
Accelerators utilfeature.Feature = "Accelerators"
|
||||
)
|
||||
|
||||
func init() {
|
||||
@ -90,6 +98,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
|
||||
ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta},
|
||||
ExperimentalCriticalPodAnnotation: {Default: false, PreRelease: utilfeature.Alpha},
|
||||
AffinityInAnnotations: {Default: false, PreRelease: utilfeature.Alpha},
|
||||
Accelerators: {Default: false, PreRelease: utilfeature.Alpha},
|
||||
|
||||
// inherited features from generic apiserver, relisted here to get a conflict if it is changed
|
||||
// unintentionally on either side:
|
||||
|
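For context, here is a hedged sketch of how the new `Accelerators` gate is consulted by consumers; the kubelet does exactly this further down in the diff. The wrapper function is illustrative, and the `utilfeature` import path is assumed from the vendoring of this era:

```go
package example

// Import path for utilfeature is assumed from the vendoring of this era; treat
// it as an assumption rather than a documented API location.
import (
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/features"
)

// acceleratorsEnabled is an illustrative wrapper: components gate their
// Nvidia-specific wiring on this check, exactly as the kubelet does later in
// this diff.
func acceleratorsEnabled() bool {
	return utilfeature.DefaultFeatureGate.Enabled(features.Accelerators)
}
```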
@ -13153,13 +13153,6 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope
|
||||
Format: "int32",
|
||||
},
|
||||
},
|
||||
"nvidiaGPUs": {
|
||||
SchemaProps: spec.SchemaProps{
|
||||
Description: "nvidiaGPUs is the number of NVIDIA GPU devices on this node.",
|
||||
Type: []string{"integer"},
|
||||
Format: "int32",
|
||||
},
|
||||
},
|
||||
"dockerExecHandlerName": {
|
||||
SchemaProps: spec.SchemaProps{
|
||||
Description: "dockerExecHandlerName is the handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.",
|
||||
@ -13494,7 +13487,7 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope
|
||||
},
|
||||
},
|
||||
},
|
||||
Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "nvidiaGPUs", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"},
|
||||
Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"},
|
||||
},
|
||||
},
|
||||
Dependencies: []string{
|
||||
|
@ -58,6 +58,8 @@ go_library(
|
||||
"//pkg/kubelet/envvars:go_default_library",
|
||||
"//pkg/kubelet/events:go_default_library",
|
||||
"//pkg/kubelet/eviction:go_default_library",
|
||||
"//pkg/kubelet/gpu:go_default_library",
|
||||
"//pkg/kubelet/gpu/nvidia:go_default_library",
|
||||
"//pkg/kubelet/images:go_default_library",
|
||||
"//pkg/kubelet/kuberuntime:go_default_library",
|
||||
"//pkg/kubelet/lifecycle:go_default_library",
|
||||
@ -169,6 +171,7 @@ go_test(
|
||||
"//pkg/kubelet/container:go_default_library",
|
||||
"//pkg/kubelet/container/testing:go_default_library",
|
||||
"//pkg/kubelet/eviction:go_default_library",
|
||||
"//pkg/kubelet/gpu:go_default_library",
|
||||
"//pkg/kubelet/images:go_default_library",
|
||||
"//pkg/kubelet/lifecycle:go_default_library",
|
||||
"//pkg/kubelet/network:go_default_library",
|
||||
@ -246,6 +249,7 @@ filegroup(
|
||||
"//pkg/kubelet/envvars:all-srcs",
|
||||
"//pkg/kubelet/events:all-srcs",
|
||||
"//pkg/kubelet/eviction:all-srcs",
|
||||
"//pkg/kubelet/gpu:all-srcs",
|
||||
"//pkg/kubelet/images:all-srcs",
|
||||
"//pkg/kubelet/kuberuntime:all-srcs",
|
||||
"//pkg/kubelet/leaky:all-srcs",
|
||||
|
pkg/kubelet/gpu/BUILD (new file, 34 lines)
@@ -0,0 +1,34 @@
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
load(
|
||||
"@io_bazel_rules_go//go:def.bzl",
|
||||
"go_library",
|
||||
)
|
||||
|
||||
go_library(
|
||||
name = "go_default_library",
|
||||
srcs = [
|
||||
"gpu_manager_stub.go",
|
||||
"types.go",
|
||||
],
|
||||
tags = ["automanaged"],
|
||||
deps = ["//pkg/api/v1:go_default_library"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "package-srcs",
|
||||
srcs = glob(["**"]),
|
||||
tags = ["automanaged"],
|
||||
visibility = ["//visibility:private"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all-srcs",
|
||||
srcs = [
|
||||
":package-srcs",
|
||||
"//pkg/kubelet/gpu/nvidia:all-srcs",
|
||||
],
|
||||
tags = ["automanaged"],
|
||||
)
|
pkg/kubelet/gpu/gpu_manager_stub.go (new file, 41 lines)
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gpu
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
)
|
||||
|
||||
type gpuManagerStub struct{}
|
||||
|
||||
func (gms *gpuManagerStub) Start() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gms *gpuManagerStub) Capacity() v1.ResourceList {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gms *gpuManagerStub) AllocateGPU(_ *v1.Pod, _ *v1.Container) ([]string, error) {
|
||||
return nil, fmt.Errorf("GPUs are not supported")
|
||||
}
|
||||
|
||||
func NewGPUManagerStub() GPUManager {
|
||||
return &gpuManagerStub{}
|
||||
}
|
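A brief usage sketch of the stub's contract, so the fallback behaviour is concrete; the function and variable names here are illustrative, not part of the commit:

```go
package example

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/kubelet/gpu"
)

// demoStub is illustrative only. With the Accelerators gate disabled the
// kubelet wires in this stub: Start is a no-op, no GPU capacity is advertised,
// and any allocation attempt fails fast. Assumes the pod has at least one
// container.
func demoStub(pod *v1.Pod) {
	var mgr gpu.GPUManager = gpu.NewGPUManagerStub()
	_ = mgr.Start()             // always nil
	fmt.Println(mgr.Capacity()) // nil: no GPU resources advertised
	if _, err := mgr.AllocateGPU(pod, &pod.Spec.Containers[0]); err != nil {
		fmt.Println(err) // "GPUs are not supported"
	}
}
```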
pkg/kubelet/gpu/nvidia/BUILD (new file, 54 lines)
@@ -0,0 +1,54 @@
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
load(
|
||||
"@io_bazel_rules_go//go:def.bzl",
|
||||
"go_library",
|
||||
"go_test",
|
||||
)
|
||||
|
||||
go_library(
|
||||
name = "go_default_library",
|
||||
srcs = [
|
||||
"helpers.go",
|
||||
"nvidia_gpu_manager.go",
|
||||
],
|
||||
tags = ["automanaged"],
|
||||
deps = [
|
||||
"//pkg/api/v1:go_default_library",
|
||||
"//pkg/kubelet/dockertools:go_default_library",
|
||||
"//pkg/kubelet/gpu:go_default_library",
|
||||
"//vendor:github.com/golang/glog",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/resource",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||
],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "package-srcs",
|
||||
srcs = glob(["**"]),
|
||||
tags = ["automanaged"],
|
||||
visibility = ["//visibility:private"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all-srcs",
|
||||
srcs = [":package-srcs"],
|
||||
tags = ["automanaged"],
|
||||
)
|
||||
|
||||
go_test(
|
||||
name = "go_default_test",
|
||||
srcs = ["nvidia_gpu_manager_test.go"],
|
||||
library = ":go_default_library",
|
||||
tags = ["automanaged"],
|
||||
deps = [
|
||||
"//pkg/api/v1:go_default_library",
|
||||
"//vendor:github.com/stretchr/testify/assert",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/resource",
|
||||
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/uuid",
|
||||
],
|
||||
)
|
pkg/kubelet/gpu/nvidia/helpers.go (new file, 59 lines)
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nvidia
|
||||
|
||||
import "k8s.io/apimachinery/pkg/util/sets"
|
||||
|
||||
// podGPUs represents a list of pod to GPU mappings.
|
||||
type podGPUs struct {
|
||||
podGPUMapping map[string]sets.String
|
||||
}
|
||||
|
||||
func newPodGPUs() *podGPUs {
|
||||
return &podGPUs{
|
||||
podGPUMapping: map[string]sets.String{},
|
||||
}
|
||||
}
|
||||
func (pgpu *podGPUs) pods() sets.String {
|
||||
ret := sets.NewString()
|
||||
for k := range pgpu.podGPUMapping {
|
||||
ret.Insert(k)
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
func (pgpu *podGPUs) insert(podUID string, device string) {
|
||||
if _, exists := pgpu.podGPUMapping[podUID]; !exists {
|
||||
pgpu.podGPUMapping[podUID] = sets.NewString(device)
|
||||
} else {
|
||||
pgpu.podGPUMapping[podUID].Insert(device)
|
||||
}
|
||||
}
|
||||
|
||||
func (pgpu *podGPUs) delete(pods []string) {
|
||||
for _, uid := range pods {
|
||||
delete(pgpu.podGPUMapping, uid)
|
||||
}
|
||||
}
|
||||
|
||||
func (pgpu *podGPUs) devices() sets.String {
|
||||
ret := sets.NewString()
|
||||
for _, devices := range pgpu.podGPUMapping {
|
||||
ret = ret.Union(devices)
|
||||
}
|
||||
return ret
|
||||
}
|
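To make the bookkeeping concrete, a hedged sketch (same package, illustrative function) of the insert/reconcile/free cycle the manager drives through `podGPUs`:

```go
package nvidia

import "k8s.io/apimachinery/pkg/util/sets"

// demoPodGPUs is illustrative only: record two devices for one pod, reclaim
// them once the pod is no longer active, and report what is still in use —
// the same lazy-free pattern AllocateGPU relies on.
func demoPodGPUs(activePodUIDs sets.String) sets.String {
	pg := newPodGPUs()
	pg.insert("pod-a", "/dev/nvidia0")
	pg.insert("pod-a", "/dev/nvidia1")

	// Free devices owned by pods that are no longer running.
	terminated := pg.pods().Difference(activePodUIDs)
	pg.delete(terminated.List())

	return pg.devices() // devices still considered in use
}
```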
pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go (new file, 279 lines)
@@ -0,0 +1,279 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path"
|
||||
"regexp"
|
||||
"sync"
|
||||
|
||||
"github.com/golang/glog"
|
||||
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockertools"
|
||||
"k8s.io/kubernetes/pkg/kubelet/gpu"
|
||||
)
|
||||
|
||||
// TODO: rework to use Nvidia's NVML, which is more complex, but also provides more fine-grained information and stats.
|
||||
const (
|
||||
// All NVIDIA GPU cards should be mounted along with nvidiactl and nvidia-uvm.
// If the driver is installed correctly, these two devices will be present.
|
||||
nvidiaCtlDevice string = "/dev/nvidiactl"
|
||||
nvidiaUVMDevice string = "/dev/nvidia-uvm"
|
||||
// Optional device.
|
||||
nvidiaUVMToolsDevice string = "/dev/nvidia-uvm-tools"
|
||||
devDirectory = "/dev"
|
||||
nvidiaDeviceRE = `^nvidia[0-9]*$`
|
||||
nvidiaFullpathRE = `^/dev/nvidia[0-9]*$`
|
||||
)
|
||||
|
||||
type activePodsLister interface {
|
||||
// Returns a list of active pods on the node.
|
||||
GetRunningPods() ([]*v1.Pod, error)
|
||||
}
|
||||
|
||||
// nvidiaGPUManager manages nvidia gpu devices.
|
||||
type nvidiaGPUManager struct {
|
||||
sync.Mutex
|
||||
// All gpus available on the Node
|
||||
allGPUs sets.String
|
||||
allocated *podGPUs
|
||||
defaultDevices []string
|
||||
// The interface which could get GPU mapping from all the containers.
|
||||
// TODO: Should make this independent of Docker in the future.
|
||||
dockerClient dockertools.DockerInterface
|
||||
activePodsLister activePodsLister
|
||||
}
|
||||
|
||||
// NewNvidiaGPUManager returns a GPUManager that manages local Nvidia GPUs.
|
||||
// TODO: Migrate to use pod level cgroups and make it generic to all runtimes.
|
||||
func NewNvidiaGPUManager(activePodsLister activePodsLister, dockerClient dockertools.DockerInterface) (gpu.GPUManager, error) {
|
||||
if dockerClient == nil {
|
||||
return nil, fmt.Errorf("invalid docker client specified")
|
||||
}
|
||||
return &nvidiaGPUManager{
|
||||
allGPUs: sets.NewString(),
|
||||
dockerClient: dockerClient,
|
||||
activePodsLister: activePodsLister,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Initialize the GPU devices, so far only needed to discover the GPU paths.
|
||||
func (ngm *nvidiaGPUManager) Start() error {
|
||||
if ngm.dockerClient == nil {
|
||||
return fmt.Errorf("Invalid docker client specified in GPU Manager")
|
||||
}
|
||||
ngm.Lock()
|
||||
defer ngm.Unlock()
|
||||
|
||||
if _, err := os.Stat(nvidiaCtlDevice); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err := os.Stat(nvidiaUVMDevice); err != nil {
|
||||
return err
|
||||
}
|
||||
ngm.defaultDevices = []string{nvidiaCtlDevice, nvidiaUVMDevice}
|
||||
_, err := os.Stat(nvidiaUVMToolsDevice)
|
||||
if !os.IsNotExist(err) {
|
||||
ngm.defaultDevices = append(ngm.defaultDevices, nvidiaUVMToolsDevice)
|
||||
}
|
||||
|
||||
if err := ngm.discoverGPUs(); err != nil {
|
||||
return err
|
||||
}
|
||||
// It's possible that the runtime isn't available now.
|
||||
allocatedGPUs, err := ngm.gpusInUse()
|
||||
if err == nil {
|
||||
ngm.allocated = allocatedGPUs
|
||||
}
|
||||
// We ignore errors when identifying allocated GPUs because it is possible that the runtime interfaces may not be logically up yet.
|
||||
return nil
|
||||
}
|
||||
|
||||
// Get how many GPU cards we have.
|
||||
func (ngm *nvidiaGPUManager) Capacity() v1.ResourceList {
|
||||
gpus := resource.NewQuantity(int64(len(ngm.allGPUs)), resource.DecimalSI)
|
||||
return v1.ResourceList{
|
||||
v1.ResourceNvidiaGPU: *gpus,
|
||||
}
|
||||
}
|
||||
|
||||
// AllocateGPUs returns `num` GPUs if available, error otherwise.
|
||||
// Allocation is made thread safe using the following logic.
|
||||
// A list of all GPUs allocated is maintained along with their respective Pod UIDs.
|
||||
// It is expected that the list of active pods will not return any false positives.
|
||||
// As part of initialization or allocation, the list of GPUs in use will be computed once.
|
||||
// Whenever an allocation happens, the list of GPUs allocated is updated based on the list of currently active pods.
|
||||
// GPUs allocated to terminated pods are freed up lazily as part of allocation.
|
||||
// GPUs are allocated based on the internal list of allocatedGPUs.
|
||||
// It is not safe to generate a list of GPUs in use by inspecting active containers because of the delay between GPU allocation and container creation.
|
||||
// A GPU allocated to a container might be re-allocated to a subsequent container because the original container wasn't started quickly enough.
|
||||
// The current algorithm scans containers only once and then uses a list of active pods to track GPU usage.
|
||||
// This is a sub-optimal solution and a better alternative would be that of using pod level cgroups instead.
|
||||
// GPUs allocated to containers should be reflected in pod level device cgroups before completing allocations.
|
||||
// The pod level cgroups will then serve as a checkpoint of GPUs in use.
|
||||
func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) ([]string, error) {
|
||||
gpusNeeded := container.Resources.Limits.NvidiaGPU().Value()
|
||||
if gpusNeeded == 0 {
|
||||
return []string{}, nil
|
||||
}
|
||||
ngm.Lock()
|
||||
defer ngm.Unlock()
|
||||
if ngm.allocated == nil {
|
||||
// Initialization is not complete. Try now. Failures can no longer be tolerated.
|
||||
allocated, err := ngm.gpusInUse()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Failed to allocate GPUs because of issues identifying GPUs in use: %v", err)
|
||||
}
|
||||
ngm.allocated = allocated
|
||||
} else {
|
||||
// update internal list of GPUs in use prior to allocating new GPUs.
|
||||
if err := ngm.updateAllocatedGPUs(); err != nil {
|
||||
return nil, fmt.Errorf("Failed to allocate GPUs because of issues with updating GPUs in use: %v", err)
|
||||
}
|
||||
}
|
||||
// Get GPU devices in use.
|
||||
devicesInUse := ngm.allocated.devices()
|
||||
glog.V(5).Infof("gpus in use: %v", devicesInUse.List())
|
||||
// Get a list of available GPUs.
|
||||
available := ngm.allGPUs.Difference(devicesInUse)
|
||||
glog.V(5).Infof("gpus available: %v", available.List())
|
||||
if int64(available.Len()) < gpusNeeded {
|
||||
return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", gpusNeeded, available.Len())
|
||||
}
|
||||
ret := available.UnsortedList()[:gpusNeeded]
|
||||
for _, device := range ret {
|
||||
// Update internal allocated GPU cache.
|
||||
ngm.allocated.insert(string(pod.UID), device)
|
||||
}
|
||||
// Add the standard device files that need to be exposed.
|
||||
ret = append(ret, ngm.defaultDevices...)
|
||||
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
// updateAllocatedGPUs updates the list of GPUs in use.
|
||||
// It gets a list of running pods and then frees any GPUs that are bound to terminated pods.
|
||||
// Returns error on failure.
|
||||
func (ngm *nvidiaGPUManager) updateAllocatedGPUs() error {
|
||||
activePods, err := ngm.activePodsLister.GetRunningPods()
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to list active pods: %v", err)
|
||||
}
|
||||
activePodUids := sets.NewString()
|
||||
for _, pod := range activePods {
|
||||
activePodUids.Insert(string(pod.UID))
|
||||
}
|
||||
allocatedPodUids := ngm.allocated.pods()
|
||||
podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
|
||||
glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
|
||||
ngm.allocated.delete(podsToBeRemoved.List())
|
||||
return nil
|
||||
}
|
||||
|
||||
// discoverGPUs identifies the NVIDIA GPU devices available on the local node by walking the `/dev` directory and records them in allGPUs.
// TODO: Without NVML support we can only check whether GPU devices exist; we
// cannot run health checks or gather details such as GPU cores, memory, or
// family name. NVML support should be added eventually, but it is not needed
// until we want richer features, e.g. scheduling containers by GPU family name.
|
||||
func (ngm *nvidiaGPUManager) discoverGPUs() error {
|
||||
reg := regexp.MustCompile(nvidiaDeviceRE)
|
||||
files, err := ioutil.ReadDir(devDirectory)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, f := range files {
|
||||
if f.IsDir() {
|
||||
continue
|
||||
}
|
||||
if reg.MatchString(f.Name()) {
|
||||
glog.V(2).Infof("Found Nvidia GPU %q", f.Name())
|
||||
ngm.allGPUs.Insert(path.Join(devDirectory, f.Name()))
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// gpusInUse returns a list of GPUs in use along with the respective pods that are using it.
|
||||
func (ngm *nvidiaGPUManager) gpusInUse() (*podGPUs, error) {
|
||||
pods, err := ngm.activePodsLister.GetRunningPods()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
type podContainers struct {
|
||||
uid string
|
||||
containerIDs sets.String
|
||||
}
|
||||
// List of containers to inspect.
|
||||
podContainersToInspect := []podContainers{}
|
||||
for _, pod := range pods {
|
||||
containers := sets.NewString()
|
||||
for _, container := range pod.Spec.Containers {
|
||||
// GPUs are expected to be specified only in limits.
|
||||
if !container.Resources.Limits.NvidiaGPU().IsZero() {
|
||||
containers.Insert(container.Name)
|
||||
}
|
||||
}
|
||||
// If no GPUs were requested, skip this pod.
|
||||
if containers.Len() == 0 {
|
||||
continue
|
||||
}
|
||||
containerIDs := sets.NewString()
|
||||
for _, container := range pod.Status.ContainerStatuses {
|
||||
if containers.Has(container.Name) {
|
||||
containerIDs.Insert(container.ContainerID)
|
||||
}
|
||||
}
|
||||
// add the pod and its containers that need to be inspected.
|
||||
podContainersToInspect = append(podContainersToInspect, podContainers{string(pod.UID), containerIDs})
|
||||
}
|
||||
ret := newPodGPUs()
|
||||
for _, podContainer := range podContainersToInspect {
|
||||
for _, containerId := range podContainer.containerIDs.List() {
|
||||
containerJSON, err := ngm.dockerClient.InspectContainer(containerId)
|
||||
if err != nil {
|
||||
glog.V(3).Infof("Failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", containerId, podContainer.uid)
|
||||
continue
|
||||
}
|
||||
|
||||
devices := containerJSON.HostConfig.Devices
|
||||
if devices == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
for _, device := range devices {
|
||||
if isValidPath(device.PathOnHost) {
|
||||
glog.V(4).Infof("Nvidia GPU %q is in use by Docker Container: %q", device.PathOnHost, containerJSON.ID)
|
||||
ret.insert(podContainer.uid, device.PathOnHost)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func isValidPath(path string) bool {
|
||||
return regexp.MustCompile(nvidiaFullpathRE).MatchString(path)
|
||||
}
|
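The long comment on AllocateGPU describes the allocation step in prose; distilled to its core, it is set arithmetic over device paths. A hedged, simplified restatement (illustrative helper, not part of the commit):

```go
package nvidia

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

// pickDevices is an illustrative distillation of AllocateGPU's core step:
// subtract the devices already recorded as in use from the discovered set and
// hand out the first `needed` paths of what remains.
func pickDevices(all, inUse sets.String, needed int64) ([]string, error) {
	available := all.Difference(inUse)
	if int64(available.Len()) < needed {
		return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", needed, available.Len())
	}
	return available.UnsortedList()[:needed], nil
}
```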
pkg/kubelet/gpu/nvidia/nvidia_gpu_manager_test.go (new file, 144 lines)
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/apimachinery/pkg/util/uuid"
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
)
|
||||
|
||||
type testActivePodsLister struct {
|
||||
activePods []*v1.Pod
|
||||
}
|
||||
|
||||
func (tapl *testActivePodsLister) GetRunningPods() ([]*v1.Pod, error) {
|
||||
return tapl.activePods, nil
|
||||
}
|
||||
|
||||
func makeTestPod(numContainers int) *v1.Pod {
|
||||
quantity := resource.NewQuantity(1, resource.DecimalSI)
|
||||
resources := v1.ResourceRequirements{
|
||||
Limits: v1.ResourceList{
|
||||
v1.ResourceNvidiaGPU: *quantity,
|
||||
},
|
||||
}
|
||||
pod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
UID: uuid.NewUUID(),
|
||||
},
|
||||
Spec: v1.PodSpec{
|
||||
Containers: []v1.Container{},
|
||||
},
|
||||
}
|
||||
for ; numContainers > 0; numContainers-- {
|
||||
pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{
|
||||
Resources: resources,
|
||||
})
|
||||
}
|
||||
return pod
|
||||
}
|
||||
|
||||
func TestMultiContainerPodGPUAllocation(t *testing.T) {
|
||||
podLister := &testActivePodsLister{}
|
||||
|
||||
testGpuManager := &nvidiaGPUManager{
|
||||
activePodsLister: podLister,
|
||||
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
|
||||
allocated: newPodGPUs(),
|
||||
}
|
||||
|
||||
// Expect that no devices are in use.
|
||||
gpusInUse, err := testGpuManager.gpusInUse()
|
||||
as := assert.New(t)
|
||||
as.Nil(err)
|
||||
as.Equal(len(gpusInUse.devices()), 0)
|
||||
|
||||
// Allocate GPUs for a pod with two containers.
|
||||
pod := makeTestPod(2)
|
||||
// Allocate for the first container.
|
||||
devices1, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices1), 1)
|
||||
|
||||
podLister.activePods = append(podLister.activePods, pod)
|
||||
// Allocate for the second container.
|
||||
devices2, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[1])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices2), 1)
|
||||
|
||||
as.NotEqual(devices1, devices2, "expected containers to get different devices")
|
||||
|
||||
// further allocations should fail.
|
||||
newPod := makeTestPod(2)
|
||||
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
|
||||
as.NotNil(err, "expected gpu allocation to fail. got: %v", devices1)
|
||||
|
||||
// Now terminate the original pod and observe that GPU allocation for new pod succeeds.
|
||||
podLister.activePods = podLister.activePods[:0]
|
||||
|
||||
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices1), 1)
|
||||
|
||||
podLister.activePods = append(podLister.activePods, newPod)
|
||||
|
||||
devices2, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[1])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices2), 1)
|
||||
|
||||
as.NotEqual(devices1, devices2, "expected containers to get different devices")
|
||||
}
|
||||
|
||||
func TestMultiPodGPUAllocation(t *testing.T) {
|
||||
podLister := &testActivePodsLister{}
|
||||
|
||||
testGpuManager := &nvidiaGPUManager{
|
||||
activePodsLister: podLister,
|
||||
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
|
||||
allocated: newPodGPUs(),
|
||||
}
|
||||
|
||||
// Expect that no devices are in use.
|
||||
gpusInUse, err := testGpuManager.gpusInUse()
|
||||
as := assert.New(t)
|
||||
as.Nil(err)
|
||||
as.Equal(len(gpusInUse.devices()), 0)
|
||||
|
||||
// Allocate a GPU for a pod with a single container.
|
||||
podA := makeTestPod(1)
|
||||
// Allocate for the first container.
|
||||
devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devicesA), 1)
|
||||
|
||||
podLister.activePods = append(podLister.activePods, podA)
|
||||
|
||||
// A second pod should be allocated the remaining GPU.
|
||||
podB := makeTestPod(1)
|
||||
// Allocate for the first container.
|
||||
devicesB, err := testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devicesB), 1)
|
||||
as.NotEqual(devicesA, devicesB, "expected pods to get different devices")
|
||||
}
|
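The PR checklist notes that GPU manager unit-test coverage is not yet complete; as an example of the kind of gap-filler that could follow, here is a hedged sketch of a direct test for the `podGPUs` helpers (not part of this commit):

```go
package nvidia

import (
	"testing"

	"github.com/stretchr/testify/assert"

	"k8s.io/apimachinery/pkg/util/sets"
)

// TestPodGPUsBookkeeping is NOT part of this commit; it sketches a direct test
// of the podGPUs helpers that back the manager's allocation logic.
func TestPodGPUsBookkeeping(t *testing.T) {
	as := assert.New(t)
	pg := newPodGPUs()
	pg.insert("pod-a", "/dev/nvidia0")
	pg.insert("pod-b", "/dev/nvidia1")

	as.Equal(sets.NewString("pod-a", "pod-b"), pg.pods())
	as.Equal(sets.NewString("/dev/nvidia0", "/dev/nvidia1"), pg.devices())

	pg.delete([]string{"pod-a"})
	as.Equal(sets.NewString("/dev/nvidia1"), pg.devices())
}
```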
pkg/kubelet/gpu/types.go (new file, 32 lines)
@@ -0,0 +1,32 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gpu
|
||||
|
||||
import "k8s.io/kubernetes/pkg/api/v1"
|
||||
|
||||
// GPUManager manages GPUs on a local node.
|
||||
// Implementations are expected to be thread safe.
|
||||
type GPUManager interface {
|
||||
// Start logically initializes GPUManager
|
||||
Start() error
|
||||
// Capacity returns the total number of GPUs on the node.
|
||||
Capacity() v1.ResourceList
|
||||
// AllocateGPU attempts to allocate GPUs for input container.
|
||||
// Returns paths to allocated GPUs and nil on success.
|
||||
// Returns an error on failure.
|
||||
AllocateGPU(*v1.Pod, *v1.Container) ([]string, error)
|
||||
}
|
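The interface is deliberately small; a hedged sketch of a trivial in-memory implementation (illustrative, e.g. for tests) to show the full contract:

```go
package gpu

import (
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/kubernetes/pkg/api/v1"
)

// fakeGPUManager is illustrative (not part of this commit): an in-memory
// GPUManager that advertises a fixed device list and hands it back verbatim.
type fakeGPUManager struct {
	devices []string
}

func (f *fakeGPUManager) Start() error { return nil }

func (f *fakeGPUManager) Capacity() v1.ResourceList {
	return v1.ResourceList{
		v1.ResourceNvidiaGPU: *resource.NewQuantity(int64(len(f.devices)), resource.DecimalSI),
	}
}

func (f *fakeGPUManager) AllocateGPU(_ *v1.Pod, _ *v1.Container) ([]string, error) {
	return f.devices, nil
}
```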
@ -67,6 +67,8 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockertools"
|
||||
"k8s.io/kubernetes/pkg/kubelet/events"
|
||||
"k8s.io/kubernetes/pkg/kubelet/eviction"
|
||||
"k8s.io/kubernetes/pkg/kubelet/gpu"
|
||||
"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
|
||||
"k8s.io/kubernetes/pkg/kubelet/images"
|
||||
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
@ -450,7 +452,6 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
|
||||
nonMasqueradeCIDR: kubeCfg.NonMasqueradeCIDR,
|
||||
maxPods: int(kubeCfg.MaxPods),
|
||||
podsPerCore: int(kubeCfg.PodsPerCore),
|
||||
nvidiaGPUs: int(kubeCfg.NvidiaGPUs),
|
||||
syncLoopMonitor: atomic.Value{},
|
||||
resolverConfig: kubeCfg.ResolverConfig,
|
||||
cpuCFSQuota: kubeCfg.CPUCFSQuota,
|
||||
@ -786,7 +787,16 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
|
||||
|
||||
klet.appArmorValidator = apparmor.NewValidator(kubeCfg.ContainerRuntime)
|
||||
klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator))
|
||||
|
||||
if utilfeature.DefaultFeatureGate.Enabled(features.Accelerators) {
|
||||
if kubeCfg.ContainerRuntime != "docker" {
|
||||
return nil, fmt.Errorf("Accelerators feature is supported with docker runtime only.")
|
||||
}
|
||||
if klet.gpuManager, err = nvidia.NewNvidiaGPUManager(klet, klet.dockerClient); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
klet.gpuManager = gpu.NewGPUManagerStub()
|
||||
}
|
||||
// Finally, put the most recent version of the config on the Kubelet, so
|
||||
// people can see how it was configured.
|
||||
klet.kubeletConfiguration = *kubeCfg
|
||||
@ -981,9 +991,6 @@ type Kubelet struct {
|
||||
// Maximum Number of Pods which can be run by this Kubelet
|
||||
maxPods int
|
||||
|
||||
// Number of NVIDIA GPUs on this node
|
||||
nvidiaGPUs int
|
||||
|
||||
// Monitor Kubelet's sync loop
|
||||
syncLoopMonitor atomic.Value
|
||||
|
||||
@ -1089,6 +1096,9 @@ type Kubelet struct {
|
||||
// This should only be enabled when the container runtime is performing user remapping AND if the
|
||||
// experimental behavior is desired.
|
||||
experimentalHostUserNamespaceDefaulting bool
|
||||
|
||||
// GPU Manager
|
||||
gpuManager gpu.GPUManager
|
||||
}
|
||||
|
||||
// setupDataDirs creates:
|
||||
@ -1182,7 +1192,10 @@ func (kl *Kubelet) initializeModules() error {
|
||||
return fmt.Errorf("Failed to start OOM watcher %v", err)
|
||||
}
|
||||
|
||||
// Step 7: Start resource analyzer
|
||||
// Step 7: Initialize GPUs
|
||||
kl.gpuManager.Start()
|
||||
|
||||
// Step 8: Start resource analyzer
|
||||
kl.resourceAnalyzer.Start()
|
||||
|
||||
return nil
|
||||
|
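Note that the `kl.gpuManager.Start()` call above drops the returned error. A hedged drop-in variant that logs the failure while still letting initialization proceed (illustrative, not what the commit does):

```go
// Illustrative drop-in variant of the call above (not what the commit does):
// log Start() failures instead of silently discarding them, while still
// letting kubelet module initialization proceed.
if err := kl.gpuManager.Start(); err != nil {
	glog.Errorf("Failed to start gpuManager: %v", err)
}
```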
@ -482,6 +482,14 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
|
||||
node.Status.Capacity = v1.ResourceList{}
|
||||
}
|
||||
|
||||
// populate GPU capacity.
|
||||
gpuCapacity := kl.gpuManager.Capacity()
|
||||
if gpuCapacity != nil {
|
||||
for k, v := range gpuCapacity {
|
||||
node.Status.Capacity[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start
|
||||
// cAdvisor locally, e.g. for test-cmd.sh, and in integration test.
|
||||
info, err := kl.GetCachedMachineInfo()
|
||||
@ -491,8 +499,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
|
||||
node.Status.Capacity[v1.ResourceCPU] = *resource.NewMilliQuantity(0, resource.DecimalSI)
|
||||
node.Status.Capacity[v1.ResourceMemory] = resource.MustParse("0Gi")
|
||||
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(int64(kl.maxPods), resource.DecimalSI)
|
||||
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(kl.nvidiaGPUs), resource.DecimalSI)
|
||||
|
||||
glog.Errorf("Error getting machine info: %v", err)
|
||||
} else {
|
||||
node.Status.NodeInfo.MachineID = info.MachineID
|
||||
@ -509,8 +515,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
|
||||
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(
|
||||
int64(kl.maxPods), resource.DecimalSI)
|
||||
}
|
||||
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(
|
||||
int64(kl.nvidiaGPUs), resource.DecimalSI)
|
||||
if node.Status.NodeInfo.BootID != "" &&
|
||||
node.Status.NodeInfo.BootID != info.BootID {
|
||||
// TODO: This requires a transaction, either both node status is updated
|
||||
|
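With the block above, GPU capacity on the node object is fed by the manager rather than a static flag; a hedged sketch (illustrative function name) of reading it back, the way the e2e test below does:

```go
package example

import "k8s.io/kubernetes/pkg/api/v1"

// advertisedGPUs is an illustrative helper: once the kubelet merges
// gpuManager.Capacity() into node.Status.Capacity, consumers can read the
// GPU count straight off the Node object.
func advertisedGPUs(node *v1.Node) int64 {
	return node.Status.Capacity.NvidiaGPU().Value()
}
```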
@ -208,16 +208,14 @@ func TestUpdateNewNodeStatus(t *testing.T) {
|
||||
KubeProxyVersion: version.Get().String(),
|
||||
},
|
||||
Capacity: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Allocatable: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Addresses: []v1.NodeAddress{
|
||||
{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},
|
||||
@ -402,10 +400,9 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Allocatable: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
},
|
||||
}
|
||||
@ -482,16 +479,14 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
|
||||
KubeProxyVersion: version.Get().String(),
|
||||
},
|
||||
Capacity: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Allocatable: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Addresses: []v1.NodeAddress{
|
||||
{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},
|
||||
@ -790,16 +785,14 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) {
|
||||
KubeProxyVersion: version.Get().String(),
|
||||
},
|
||||
Capacity: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Allocatable: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Addresses: []v1.NodeAddress{
|
||||
{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},
|
||||
|
@ -84,20 +84,23 @@ func (kl *Kubelet) getActivePods() []*v1.Pod {
|
||||
}
|
||||
|
||||
// makeDevices determines the devices for the given container.
|
||||
// Experimental. For now, we hardcode /dev/nvidia0 no matter what the user asks for
|
||||
// (we only support one device per node).
|
||||
// TODO: add support for more than 1 GPU after #28216.
|
||||
func makeDevices(container *v1.Container) []kubecontainer.DeviceInfo {
|
||||
nvidiaGPULimit := container.Resources.Limits.NvidiaGPU()
|
||||
if nvidiaGPULimit.Value() != 0 {
|
||||
return []kubecontainer.DeviceInfo{
|
||||
{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
|
||||
{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
|
||||
{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
|
||||
}
|
||||
// Experimental.
|
||||
func (kl *Kubelet) makeDevices(pod *v1.Pod, container *v1.Container) ([]kubecontainer.DeviceInfo, error) {
|
||||
if container.Resources.Limits.NvidiaGPU().IsZero() {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return nil
|
||||
nvidiaGPUPaths, err := kl.gpuManager.AllocateGPU(pod, container)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var devices []kubecontainer.DeviceInfo
|
||||
for _, path := range nvidiaGPUPaths {
|
||||
// Devices have to be mapped one to one because of nvidia CUDA library requirements.
|
||||
devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: path, Permissions: "mrw"})
|
||||
}
|
||||
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
// makeMounts determines the mount points for the given container.
|
||||
@ -285,7 +288,10 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Contai
|
||||
|
||||
opts.PortMappings = kubecontainer.MakePortMappings(container)
|
||||
// TODO(random-liu): Move following convert functions into pkg/kubelet/container
|
||||
opts.Devices = makeDevices(container)
|
||||
opts.Devices, err = kl.makeDevices(pod, container)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
opts.Mounts, err = makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIP, volumes)
|
||||
if err != nil {
|
||||
|
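To make the new mapping concrete: for a container whose limits request two GPUs on a node where /dev/nvidia0 and /dev/nvidia1 are free, makeDevices would produce a device list shaped like the following. This is an illustration of the expected shape, not captured output:

```go
// Illustrative shape of the result only (not captured output): two allocated
// card devices plus the default control devices appended by the Nvidia
// manager, each mapped 1:1 into the container with "mrw" permissions.
devices := []kubecontainer.DeviceInfo{
	{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
	{PathOnHost: "/dev/nvidia1", PathInContainer: "/dev/nvidia1", Permissions: "mrw"},
	{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
	{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
}
_ = devices
```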
@ -27,7 +27,6 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
@ -1711,39 +1710,6 @@ func TestGetHostPortConflicts(t *testing.T) {
|
||||
assert.True(t, hasHostPortConflicts(pods), "Should have port conflicts")
|
||||
}
|
||||
|
||||
func TestMakeDevices(t *testing.T) {
|
||||
testCases := []struct {
|
||||
container *v1.Container
|
||||
devices []kubecontainer.DeviceInfo
|
||||
test string
|
||||
}{
|
||||
{
|
||||
test: "no device",
|
||||
container: &v1.Container{},
|
||||
devices: nil,
|
||||
},
|
||||
{
|
||||
test: "gpu",
|
||||
container: &v1.Container{
|
||||
Resources: v1.ResourceRequirements{
|
||||
Limits: map[v1.ResourceName]resource.Quantity{
|
||||
v1.ResourceNvidiaGPU: resource.MustParse("1000"),
|
||||
},
|
||||
},
|
||||
},
|
||||
devices: []kubecontainer.DeviceInfo{
|
||||
{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
|
||||
{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
|
||||
{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range testCases {
|
||||
assert.Equal(t, test.devices, makeDevices(test.container), "[test %q]", test.test)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasHostMountPVC(t *testing.T) {
|
||||
tests := map[string]struct {
|
||||
pvError error
|
||||
|
@ -49,6 +49,7 @@ import (
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
|
||||
"k8s.io/kubernetes/pkg/kubelet/eviction"
|
||||
"k8s.io/kubernetes/pkg/kubelet/gpu"
|
||||
"k8s.io/kubernetes/pkg/kubelet/images"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/network"
|
||||
@ -272,7 +273,7 @@ func newTestKubeletWithImageList(
|
||||
|
||||
kubelet.AddPodSyncLoopHandler(activeDeadlineHandler)
|
||||
kubelet.AddPodSyncHandler(activeDeadlineHandler)
|
||||
|
||||
kubelet.gpuManager = gpu.NewGPUManagerStub()
|
||||
return &TestKubelet{kubelet, fakeRuntime, mockCadvisor, fakeKubeClient, fakeMirrorClient, fakeClock, nil, plug}
|
||||
}
|
||||
|
||||
|
@ -150,7 +150,6 @@ func GetHollowKubeletConfig(
|
||||
c.MaxContainerCount = 100
|
||||
c.MaxOpenFiles = 1024
|
||||
c.MaxPerPodContainerCount = 2
|
||||
c.NvidiaGPUs = 0
|
||||
c.RegisterNode = true
|
||||
c.RegisterSchedulable = true
|
||||
c.RegistryBurst = 10
|
||||
|
@ -224,7 +224,7 @@ func (c *PodClient) WaitForErrorEventOrSuccess(pod *v1.Pod) (*v1.Event, error) {
|
||||
return ev, err
|
||||
}
|
||||
|
||||
// MatchContainerOutput gest output of a container and match expected regexp in the output.
|
||||
// MatchContainerOutput gets output of a container and match expected regexp in the output.
|
||||
func (c *PodClient) MatchContainerOutput(name string, containerName string, expectedRegexp string) error {
|
||||
f := c.f
|
||||
output, err := GetPodLogs(f.ClientSet, f.Namespace.Name, name, containerName)
|
||||
|
@ -14,6 +14,7 @@ go_library(
|
||||
"benchmark_util.go",
|
||||
"container.go",
|
||||
"doc.go",
|
||||
"gpus.go",
|
||||
"image_list.go",
|
||||
"resource_collector.go",
|
||||
"simple_mount.go",
|
||||
@ -37,12 +38,14 @@ go_library(
|
||||
"//vendor:github.com/onsi/gomega",
|
||||
"//vendor:github.com/opencontainers/runc/libcontainer/cgroups",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/errors",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/resource",
|
||||
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
|
||||
"//vendor:k8s.io/apimachinery/pkg/labels",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/runtime",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/uuid",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/wait",
|
||||
"//vendor:k8s.io/client-go/pkg/api",
|
||||
],
|
||||
)
|
||||
|
||||
|
test/e2e_node/gpus.go (new file, 135 lines)
@@ -0,0 +1,135 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e_node
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/pkg/api"
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
"k8s.io/kubernetes/pkg/apis/componentconfig"
|
||||
"k8s.io/kubernetes/test/e2e/framework"
|
||||
|
||||
. "github.com/onsi/ginkgo"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
const acceleratorsFeatureGate = "Accelerators=true"
|
||||
|
||||
// Serial because the test updates kubelet configuration.
|
||||
var _ = framework.KubeDescribe("GPU [Serial]", func() {
|
||||
f := framework.NewDefaultFramework("gpu-test")
|
||||
Context("attempt to use GPUs if available", func() {
|
||||
It("setup the node and create pods to test gpus", func() {
|
||||
By("ensuring that dynamic kubelet configuration is enabled")
|
||||
enabled, err := isKubeletConfigEnabled(f)
|
||||
framework.ExpectNoError(err)
|
||||
if !enabled {
|
||||
Skip("Dynamic Kubelet configuration is not enabled. Skipping test.")
|
||||
}
|
||||
|
||||
By("enabling support for GPUs")
|
||||
var oldCfg *componentconfig.KubeletConfiguration
|
||||
defer func() {
|
||||
if oldCfg != nil {
|
||||
framework.ExpectNoError(setKubeletConfiguration(f, oldCfg))
|
||||
}
|
||||
}()
|
||||
|
||||
oldCfg, err = getCurrentKubeletConfig()
|
||||
framework.ExpectNoError(err)
|
||||
clone, err := api.Scheme.DeepCopy(oldCfg)
|
||||
framework.ExpectNoError(err)
|
||||
newCfg := clone.(*componentconfig.KubeletConfiguration)
|
||||
if newCfg.FeatureGates != "" {
|
||||
newCfg.FeatureGates = fmt.Sprintf("%s,%s", acceleratorsFeatureGate, newCfg.FeatureGates)
|
||||
} else {
|
||||
newCfg.FeatureGates = acceleratorsFeatureGate
|
||||
}
|
||||
framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
|
||||
|
||||
By("Getting the local node object from the api server")
|
||||
nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
|
||||
framework.ExpectNoError(err, "getting node list")
|
||||
Expect(len(nodeList.Items)).To(Equal(1))
|
||||
node := nodeList.Items[0]
|
||||
gpusAvailable := node.Status.Capacity.NvidiaGPU()
|
||||
By("Skipping the test if GPUs aren't available")
|
||||
if gpusAvailable.IsZero() {
|
||||
Skip("No GPUs available on local node. Skipping test.")
|
||||
}
|
||||
|
||||
By("Creating a pod that will consume all GPUs")
|
||||
podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
|
||||
podSuccess = f.PodClient().CreateSync(podSuccess)
|
||||
|
||||
By("Checking if the pod outputted Success to its logs")
|
||||
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))
|
||||
|
||||
By("Creating a new pod requesting a GPU and noticing that it is rejected by the Kubelet")
|
||||
podFailure := makePod(1, "gpu-failure")
|
||||
framework.WaitForPodCondition(f.ClientSet, f.Namespace.Name, podFailure.Name, "pod rejected", framework.PodStartTimeout, func(pod *v1.Pod) (bool, error) {
|
||||
if pod.Status.Phase == v1.PodFailed {
|
||||
return true, nil
|
||||
|
||||
}
|
||||
return false, nil
|
||||
})
|
||||
|
||||
By("stopping the original Pod with GPUs")
|
||||
gp := int64(0)
|
||||
deleteOptions := metav1.DeleteOptions{
|
||||
GracePeriodSeconds: &gp,
|
||||
}
|
||||
f.PodClient().DeleteSync(podSuccess.Name, &deleteOptions, 30*time.Second)
|
||||
|
||||
By("attempting to start the failed pod again")
|
||||
f.PodClient().DeleteSync(podFailure.Name, &deleteOptions, 10*time.Second)
|
||||
podFailure = f.PodClient().CreateSync(podFailure)
|
||||
|
||||
By("Checking if the pod outputted Success to its logs")
|
||||
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podFailure.Name, podFailure.Name, "Success"))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
func makePod(gpus int64, name string) *v1.Pod {
|
||||
resources := v1.ResourceRequirements{
|
||||
Limits: v1.ResourceList{
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
|
||||
},
|
||||
}
|
||||
gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success; sleep 10240 ", gpus)
|
||||
return &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: name,
|
||||
},
|
||||
Spec: v1.PodSpec{
|
||||
Containers: []v1.Container{
|
||||
{
|
||||
Image: "gcr.io/google_containers/busybox:1.24",
|
||||
Name: name,
|
||||
Command: []string{"sh", "-c", gpuverificationCmd},
|
||||
Resources: resources,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|