diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index 4ef0d35f9e8..b9322fcd0ea 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -206,7 +206,6 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) { fs.BoolVar(&s.BabysitDaemons, "babysit-daemons", s.BabysitDaemons, "If true, the node has babysitter process monitoring docker and kubelet.") fs.MarkDeprecated("babysit-daemons", "Will be removed in a future version.") fs.Int32Var(&s.MaxPods, "max-pods", s.MaxPods, "Number of Pods that can run on this Kubelet.") - fs.Int32Var(&s.NvidiaGPUs, "experimental-nvidia-gpus", s.NvidiaGPUs, "Number of NVIDIA GPU devices on this node. Only 0 (default) and 1 are currently supported.") // TODO(#40229): Remove the docker-exec-handler flag. fs.StringVar(&s.DockerExecHandlerName, "docker-exec-handler", s.DockerExecHandlerName, "Handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.") fs.MarkDeprecated("docker-exec-handler", "this flag will be removed and only the 'native' handler will be supported in the future.") diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index b347c009ad9..fab69189935 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -690,3 +690,4 @@ windows-line-endings www-prefix zone-id zone-name + diff --git a/pkg/apis/componentconfig/types.go b/pkg/apis/componentconfig/types.go index 88ea80e8f2d..48f25628532 100644 --- a/pkg/apis/componentconfig/types.go +++ b/pkg/apis/componentconfig/types.go @@ -362,8 +362,6 @@ type KubeletConfiguration struct { BabysitDaemons bool // maxPods is the number of pods that can run on this Kubelet. MaxPods int32 - // nvidiaGPUs is the number of NVIDIA GPU devices on this node. - NvidiaGPUs int32 // dockerExecHandlerName is the handler to use when executing a command // in a container. Valid values are 'native' and 'nsenter'. Defaults to // 'native'. diff --git a/pkg/apis/componentconfig/v1alpha1/types.go b/pkg/apis/componentconfig/v1alpha1/types.go index 28b7499a3ef..0284b244343 100644 --- a/pkg/apis/componentconfig/v1alpha1/types.go +++ b/pkg/apis/componentconfig/v1alpha1/types.go @@ -407,8 +407,6 @@ type KubeletConfiguration struct { BabysitDaemons bool `json:"babysitDaemons"` // maxPods is the number of pods that can run on this Kubelet. MaxPods int32 `json:"maxPods"` - // nvidiaGPUs is the number of NVIDIA GPU devices on this node. - NvidiaGPUs int32 `json:"nvidiaGPUs"` // dockerExecHandlerName is the handler to use when executing a command // in a container. Valid values are 'native' and 'nsenter'. Defaults to // 'native'. 
diff --git a/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go b/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go index ac958c3bf93..cc7deafc92b 100644 --- a/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go +++ b/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go @@ -353,7 +353,6 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu out.HairpinMode = in.HairpinMode out.BabysitDaemons = in.BabysitDaemons out.MaxPods = in.MaxPods - out.NvidiaGPUs = in.NvidiaGPUs out.DockerExecHandlerName = in.DockerExecHandlerName out.PodCIDR = in.PodCIDR out.ResolverConfig = in.ResolverConfig @@ -531,7 +530,6 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu out.HairpinMode = in.HairpinMode out.BabysitDaemons = in.BabysitDaemons out.MaxPods = in.MaxPods - out.NvidiaGPUs = in.NvidiaGPUs out.DockerExecHandlerName = in.DockerExecHandlerName out.PodCIDR = in.PodCIDR out.ResolverConfig = in.ResolverConfig diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 4bf9fedea5e..44a21d974d2 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -73,6 +73,14 @@ const ( // Determines if affinity defined in annotations should be processed // TODO: remove when alpha support for affinity is removed AffinityInAnnotations utilfeature.Feature = "AffinityInAnnotations" + + // owner: @vishh + // alpha: v1.6 + // + // Enables support for GPUs as a schedulable resource. + // Only Nvidia GPUs are supported as of v1.6. + // Works only with Docker Container Runtime. + Accelerators utilfeature.Feature = "Accelerators" ) func init() { @@ -90,6 +98,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta}, ExperimentalCriticalPodAnnotation: {Default: false, PreRelease: utilfeature.Alpha}, AffinityInAnnotations: {Default: false, PreRelease: utilfeature.Alpha}, + Accelerators: {Default: false, PreRelease: utilfeature.Alpha}, // inherited features from generic apiserver, relisted here to get a conflict if it is changed // unintentionally on either side: diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index 655e159249f..acd857829dc 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -13153,13 +13153,6 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope Format: "int32", }, }, - "nvidiaGPUs": { - SchemaProps: spec.SchemaProps{ - Description: "nvidiaGPUs is the number of NVIDIA GPU devices on this node.", - Type: []string{"integer"}, - Format: "int32", - }, - }, "dockerExecHandlerName": { SchemaProps: spec.SchemaProps{ Description: "dockerExecHandlerName is the handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. 
Defaults to 'native'.", @@ -13494,7 +13487,7 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope }, }, }, - Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "nvidiaGPUs", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"}, + Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", 
"exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"}, }, }, Dependencies: []string{ diff --git a/pkg/kubelet/BUILD b/pkg/kubelet/BUILD index d412f34fcb3..848f44146c7 100644 --- a/pkg/kubelet/BUILD +++ b/pkg/kubelet/BUILD @@ -58,6 +58,8 @@ go_library( "//pkg/kubelet/envvars:go_default_library", "//pkg/kubelet/events:go_default_library", "//pkg/kubelet/eviction:go_default_library", + "//pkg/kubelet/gpu:go_default_library", + "//pkg/kubelet/gpu/nvidia:go_default_library", "//pkg/kubelet/images:go_default_library", "//pkg/kubelet/kuberuntime:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", @@ -169,6 +171,7 @@ go_test( "//pkg/kubelet/container:go_default_library", "//pkg/kubelet/container/testing:go_default_library", "//pkg/kubelet/eviction:go_default_library", + "//pkg/kubelet/gpu:go_default_library", "//pkg/kubelet/images:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", "//pkg/kubelet/network:go_default_library", @@ -246,6 +249,7 @@ filegroup( "//pkg/kubelet/envvars:all-srcs", "//pkg/kubelet/events:all-srcs", "//pkg/kubelet/eviction:all-srcs", + "//pkg/kubelet/gpu:all-srcs", "//pkg/kubelet/images:all-srcs", "//pkg/kubelet/kuberuntime:all-srcs", "//pkg/kubelet/leaky:all-srcs", diff --git a/pkg/kubelet/gpu/BUILD b/pkg/kubelet/gpu/BUILD new file mode 100644 index 00000000000..9c0ba77ae6c --- /dev/null +++ b/pkg/kubelet/gpu/BUILD @@ -0,0 +1,34 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) + +load( + "@io_bazel_rules_go//go:def.bzl", + "go_library", +) + +go_library( + name = "go_default_library", + srcs = [ + "gpu_manager_stub.go", + "types.go", + ], + tags = ["automanaged"], + deps = ["//pkg/api/v1:go_default_library"], +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [ + ":package-srcs", + "//pkg/kubelet/gpu/nvidia:all-srcs", + ], + tags = ["automanaged"], +) diff --git a/pkg/kubelet/gpu/gpu_manager_stub.go b/pkg/kubelet/gpu/gpu_manager_stub.go new file mode 100644 index 00000000000..a21b5feb667 --- /dev/null +++ b/pkg/kubelet/gpu/gpu_manager_stub.go @@ -0,0 +1,41 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package gpu + +import ( + "fmt" + + "k8s.io/kubernetes/pkg/api/v1" +) + +type gpuManagerStub struct{} + +func (gms *gpuManagerStub) Start() error { + return nil +} + +func (gms *gpuManagerStub) Capacity() v1.ResourceList { + return nil +} + +func (gms *gpuManagerStub) AllocateGPU(_ *v1.Pod, _ *v1.Container) ([]string, error) { + return nil, fmt.Errorf("GPUs are not supported") +} + +func NewGPUManagerStub() GPUManager { + return &gpuManagerStub{} +} diff --git a/pkg/kubelet/gpu/nvidia/BUILD b/pkg/kubelet/gpu/nvidia/BUILD new file mode 100644 index 00000000000..e9827578ac5 --- /dev/null +++ b/pkg/kubelet/gpu/nvidia/BUILD @@ -0,0 +1,54 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) + +load( + "@io_bazel_rules_go//go:def.bzl", + "go_library", + "go_test", +) + +go_library( + name = "go_default_library", + srcs = [ + "helpers.go", + "nvidia_gpu_manager.go", + ], + tags = ["automanaged"], + deps = [ + "//pkg/api/v1:go_default_library", + "//pkg/kubelet/dockertools:go_default_library", + "//pkg/kubelet/gpu:go_default_library", + "//vendor:github.com/golang/glog", + "//vendor:k8s.io/apimachinery/pkg/api/resource", + "//vendor:k8s.io/apimachinery/pkg/util/sets", + ], +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [":package-srcs"], + tags = ["automanaged"], +) + +go_test( + name = "go_default_test", + srcs = ["nvidia_gpu_manager_test.go"], + library = ":go_default_library", + tags = ["automanaged"], + deps = [ + "//pkg/api/v1:go_default_library", + "//vendor:github.com/stretchr/testify/assert", + "//vendor:k8s.io/apimachinery/pkg/api/resource", + "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", + "//vendor:k8s.io/apimachinery/pkg/util/sets", + "//vendor:k8s.io/apimachinery/pkg/util/uuid", + ], +) diff --git a/pkg/kubelet/gpu/nvidia/helpers.go b/pkg/kubelet/gpu/nvidia/helpers.go new file mode 100644 index 00000000000..0abedbf4014 --- /dev/null +++ b/pkg/kubelet/gpu/nvidia/helpers.go @@ -0,0 +1,59 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nvidia + +import "k8s.io/apimachinery/pkg/util/sets" + +// podGPUs represents a list of pod to GPU mappings. 
+type podGPUs struct { + podGPUMapping map[string]sets.String +} + +func newPodGPUs() *podGPUs { + return &podGPUs{ + podGPUMapping: map[string]sets.String{}, + } +} +func (pgpu *podGPUs) pods() sets.String { + ret := sets.NewString() + for k := range pgpu.podGPUMapping { + ret.Insert(k) + } + return ret +} + +func (pgpu *podGPUs) insert(podUID string, device string) { + if _, exists := pgpu.podGPUMapping[podUID]; !exists { + pgpu.podGPUMapping[podUID] = sets.NewString(device) + } else { + pgpu.podGPUMapping[podUID].Insert(device) + } +} + +func (pgpu *podGPUs) delete(pods []string) { + for _, uid := range pods { + delete(pgpu.podGPUMapping, uid) + } +} + +func (pgpu *podGPUs) devices() sets.String { + ret := sets.NewString() + for _, devices := range pgpu.podGPUMapping { + ret = ret.Union(devices) + } + return ret +} diff --git a/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go b/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go new file mode 100644 index 00000000000..43b0e3b32a0 --- /dev/null +++ b/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go @@ -0,0 +1,279 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nvidia + +import ( + "fmt" + "io/ioutil" + "os" + "path" + "regexp" + "sync" + + "github.com/golang/glog" + + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/kubelet/dockertools" + "k8s.io/kubernetes/pkg/kubelet/gpu" +) + +// TODO: rework to use Nvidia's NVML, which is more complex, but also provides more fine-grained information and stats. +const ( + // All NVIDIA GPUs cards should be mounted with nvidiactl and nvidia-uvm + // If the driver installed correctly, the 2 devices will be there. + nvidiaCtlDevice string = "/dev/nvidiactl" + nvidiaUVMDevice string = "/dev/nvidia-uvm" + // Optional device. + nvidiaUVMToolsDevice string = "/dev/nvidia-uvm-tools" + devDirectory = "/dev" + nvidiaDeviceRE = `^nvidia[0-9]*$` + nvidiaFullpathRE = `^/dev/nvidia[0-9]*$` +) + +type activePodsLister interface { + // Returns a list of active pods on the node. + GetRunningPods() ([]*v1.Pod, error) +} + +// nvidiaGPUManager manages nvidia gpu devices. +type nvidiaGPUManager struct { + sync.Mutex + // All gpus available on the Node + allGPUs sets.String + allocated *podGPUs + defaultDevices []string + // The interface which could get GPU mapping from all the containers. + // TODO: Should make this independent of Docker in the future. + dockerClient dockertools.DockerInterface + activePodsLister activePodsLister +} + +// NewNvidiaGPUManager returns a GPUManager that manages local Nvidia GPUs. +// TODO: Migrate to use pod level cgroups and make it generic to all runtimes. 
+func NewNvidiaGPUManager(activePodsLister activePodsLister, dockerClient dockertools.DockerInterface) (gpu.GPUManager, error) { + if dockerClient == nil { + return nil, fmt.Errorf("invalid docker client specified") + } + return &nvidiaGPUManager{ + allGPUs: sets.NewString(), + dockerClient: dockerClient, + activePodsLister: activePodsLister, + }, nil +} + +// Initialize the GPU devices, so far only needed to discover the GPU paths. +func (ngm *nvidiaGPUManager) Start() error { + if ngm.dockerClient == nil { + return fmt.Errorf("Invalid docker client specified in GPU Manager") + } + ngm.Lock() + defer ngm.Unlock() + + if _, err := os.Stat(nvidiaCtlDevice); err != nil { + return err + } + + if _, err := os.Stat(nvidiaUVMDevice); err != nil { + return err + } + ngm.defaultDevices = []string{nvidiaCtlDevice, nvidiaUVMDevice} + _, err := os.Stat(nvidiaUVMToolsDevice) + if !os.IsNotExist(err) { + ngm.defaultDevices = append(ngm.defaultDevices, nvidiaUVMToolsDevice) + } + + if err := ngm.discoverGPUs(); err != nil { + return err + } + // It's possible that the runtime isn't available now. + allocatedGPUs, err := ngm.gpusInUse() + if err == nil { + ngm.allocated = allocatedGPUs + } + // We ignore errors when identifying allocated GPUs because it is possible that the runtime interfaces may not be logically up. + return nil +} + +// Get how many GPU cards we have. +func (ngm *nvidiaGPUManager) Capacity() v1.ResourceList { + gpus := resource.NewQuantity(int64(len(ngm.allGPUs)), resource.DecimalSI) + return v1.ResourceList{ + v1.ResourceNvidiaGPU: *gpus, + } +} + +// AllocateGPU returns the number of GPUs requested by the container if available, error otherwise. +// Allocation is made thread safe using the following logic. +// A list of all GPUs allocated is maintained along with their respective Pod UIDs. +// It is expected that the list of active pods will not return any false positives. +// As part of initialization or allocation, the list of GPUs in use will be computed once. +// Whenever an allocation happens, the list of GPUs allocated is updated based on the list of currently active pods. +// GPUs allocated to terminated pods are freed up lazily as part of allocation. +// GPUs are allocated based on the internal list of allocatedGPUs. +// It is not safe to generate a list of GPUs in use by inspecting active containers because of the delay between GPU allocation and container creation. +// A GPU allocated to a container might be re-allocated to a subsequent container because the original container wasn't started quickly enough. +// The current algorithm scans containers only once and then uses a list of active pods to track GPU usage. +// This is a sub-optimal solution and a better alternative would be to use pod level cgroups instead. +// GPUs allocated to containers should be reflected in pod level device cgroups before completing allocations. +// The pod level cgroups will then serve as a checkpoint of GPUs in use. +func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) ([]string, error) { + gpusNeeded := container.Resources.Limits.NvidiaGPU().Value() + if gpusNeeded == 0 { + return []string{}, nil + } + ngm.Lock() + defer ngm.Unlock() + if ngm.allocated == nil { + // Initialization is not complete. Try now. Failures can no longer be tolerated.
+ allocated, err := ngm.gpusInUse() + if err != nil { + return nil, fmt.Errorf("Failed to allocate GPUs because of issues identifying GPUs in use: %v", err) + } + ngm.allocated = allocated + } else { + // Update the internal list of GPUs in use prior to allocating new GPUs. + if err := ngm.updateAllocatedGPUs(); err != nil { + return nil, fmt.Errorf("Failed to allocate GPUs because of issues with updating GPUs in use: %v", err) + } + } + // Get GPU devices in use. + devicesInUse := ngm.allocated.devices() + glog.V(5).Infof("gpus in use: %v", devicesInUse.List()) + // Get a list of available GPUs. + available := ngm.allGPUs.Difference(devicesInUse) + glog.V(5).Infof("gpus available: %v", available.List()) + if int64(available.Len()) < gpusNeeded { + return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", gpusNeeded, available.Len()) + } + ret := available.UnsortedList()[:gpusNeeded] + for _, device := range ret { + // Update internal allocated GPU cache. + ngm.allocated.insert(string(pod.UID), device) + } + // Add the standard device files that need to be exposed. + ret = append(ret, ngm.defaultDevices...) + + return ret, nil +} + +// updateAllocatedGPUs updates the list of GPUs in use. +// It gets a list of running pods and then frees any GPUs that are bound to terminated pods. +// Returns error on failure. +func (ngm *nvidiaGPUManager) updateAllocatedGPUs() error { + activePods, err := ngm.activePodsLister.GetRunningPods() + if err != nil { + return fmt.Errorf("Failed to list active pods: %v", err) + } + activePodUids := sets.NewString() + for _, pod := range activePods { + activePodUids.Insert(string(pod.UID)) + } + allocatedPodUids := ngm.allocated.pods() + podsToBeRemoved := allocatedPodUids.Difference(activePodUids) + glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List()) + ngm.allocated.delete(podsToBeRemoved.List()) + return nil +} + +// discoverGPUs identifies all NVIDIA GPU devices available on the local node by walking the `/dev` directory. +// TODO: Without NVML support we can only check whether GPU devices exist, but +// cannot run a health check or get more information such as GPU cores, memory, or +// family name. NVML support is needed in the future, but only once we want more +// features, such as scheduling containers according to GPU family name. +func (ngm *nvidiaGPUManager) discoverGPUs() error { + reg := regexp.MustCompile(nvidiaDeviceRE) + files, err := ioutil.ReadDir(devDirectory) + if err != nil { + return err + } + for _, f := range files { + if f.IsDir() { + continue + } + if reg.MatchString(f.Name()) { + glog.V(2).Infof("Found Nvidia GPU %q", f.Name()) + ngm.allGPUs.Insert(path.Join(devDirectory, f.Name())) + } + } + + return nil +} + +// gpusInUse returns a list of GPUs in use along with the respective pods that are using them. +func (ngm *nvidiaGPUManager) gpusInUse() (*podGPUs, error) { + pods, err := ngm.activePodsLister.GetRunningPods() + if err != nil { + return nil, err + } + type podContainers struct { + uid string + containerIDs sets.String + } + // List of containers to inspect. + podContainersToInspect := []podContainers{} + for _, pod := range pods { + containers := sets.NewString() + for _, container := range pod.Spec.Containers { + // GPUs are expected to be specified only in limits. + if !container.Resources.Limits.NvidiaGPU().IsZero() { + containers.Insert(container.Name) + } + } + // If no GPUs were requested, skip this pod.
+ if containers.Len() == 0 { + continue + } + containerIDs := sets.NewString() + for _, container := range pod.Status.ContainerStatuses { + if containers.Has(container.Name) { + containerIDs.Insert(container.ContainerID) + } + } + // add the pod and its containers that need to be inspected. + podContainersToInspect = append(podContainersToInspect, podContainers{string(pod.UID), containerIDs}) + } + ret := newPodGPUs() + for _, podContainer := range podContainersToInspect { + for _, containerId := range podContainer.containerIDs.List() { + containerJSON, err := ngm.dockerClient.InspectContainer(containerId) + if err != nil { + glog.V(3).Infof("Failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", containerId, podContainer.uid) + continue + } + + devices := containerJSON.HostConfig.Devices + if devices == nil { + continue + } + + for _, device := range devices { + if isValidPath(device.PathOnHost) { + glog.V(4).Infof("Nvidia GPU %q is in use by Docker Container: %q", device.PathOnHost, containerJSON.ID) + ret.insert(podContainer.uid, device.PathOnHost) + } + } + } + } + return ret, nil +} + +func isValidPath(path string) bool { + return regexp.MustCompile(nvidiaFullpathRE).MatchString(path) +} diff --git a/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager_test.go b/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager_test.go new file mode 100644 index 00000000000..aea168ba568 --- /dev/null +++ b/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager_test.go @@ -0,0 +1,144 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nvidia + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/uuid" + "k8s.io/kubernetes/pkg/api/v1" +) + +type testActivePodsLister struct { + activePods []*v1.Pod +} + +func (tapl *testActivePodsLister) GetRunningPods() ([]*v1.Pod, error) { + return tapl.activePods, nil +} + +func makeTestPod(numContainers int) *v1.Pod { + quantity := resource.NewQuantity(1, resource.DecimalSI) + resources := v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceNvidiaGPU: *quantity, + }, + } + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: uuid.NewUUID(), + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{}, + }, + } + for ; numContainers > 0; numContainers-- { + pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{ + Resources: resources, + }) + } + return pod +} + +func TestMultiContainerPodGPUAllocation(t *testing.T) { + podLister := &testActivePodsLister{} + + testGpuManager := &nvidiaGPUManager{ + activePodsLister: podLister, + allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"), + allocated: newPodGPUs(), + } + + // Expect that no devices are in use. + gpusInUse, err := testGpuManager.gpusInUse() + as := assert.New(t) + as.Nil(err) + as.Equal(len(gpusInUse.devices()), 0) + + // Allocated GPUs for a pod with two containers. 
+ pod := makeTestPod(2) + // Allocate for the first container. + devices1, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[0]) + as.Nil(err) + as.Equal(len(devices1), 1) + + podLister.activePods = append(podLister.activePods, pod) + // Allocate for the second container. + devices2, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[1]) + as.Nil(err) + as.Equal(len(devices2), 1) + + as.NotEqual(devices1, devices2, "expected containers to get different devices") + + // further allocations should fail. + newPod := makeTestPod(2) + devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0]) + as.NotNil(err, "expected gpu allocation to fail. got: %v", devices1) + + // Now terminate the original pod and observe that GPU allocation for new pod succeeds. + podLister.activePods = podLister.activePods[:0] + + devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0]) + as.Nil(err) + as.Equal(len(devices1), 1) + + podLister.activePods = append(podLister.activePods, newPod) + + devices2, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[1]) + as.Nil(err) + as.Equal(len(devices2), 1) + + as.NotEqual(devices1, devices2, "expected containers to get different devices") +} + +func TestMultiPodGPUAllocation(t *testing.T) { + podLister := &testActivePodsLister{} + + testGpuManager := &nvidiaGPUManager{ + activePodsLister: podLister, + allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"), + allocated: newPodGPUs(), + } + + // Expect that no devices are in use. + gpusInUse, err := testGpuManager.gpusInUse() + as := assert.New(t) + as.Nil(err) + as.Equal(len(gpusInUse.devices()), 0) + + // Allocate a GPU for a pod with a single container. + podA := makeTestPod(1) + // Allocate for the first container. + devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0]) + as.Nil(err) + as.Equal(len(devicesA), 1) + + podLister.activePods = append(podLister.activePods, podA) + + // A second pod should be allocated the remaining GPU. + podB := makeTestPod(1) + // Allocate for its only container. + devicesB, err := testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0]) + as.Nil(err) + as.Equal(len(devicesB), 1) + as.NotEqual(devicesA, devicesB, "expected pods to get different devices") +} diff --git a/pkg/kubelet/gpu/types.go b/pkg/kubelet/gpu/types.go new file mode 100644 index 00000000000..afc01844e0c --- /dev/null +++ b/pkg/kubelet/gpu/types.go @@ -0,0 +1,32 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpu + +import "k8s.io/kubernetes/pkg/api/v1" + +// GPUManager manages GPUs on a local node. +// Implementations are expected to be thread safe. +type GPUManager interface { + // Start logically initializes GPUManager + Start() error + // Capacity returns the total number of GPUs on the node. + Capacity() v1.ResourceList + // AllocateGPU attempts to allocate GPUs for input container. + // Returns paths to allocated GPUs and nil on success. + // Returns an error on failure.
+ AllocateGPU(*v1.Pod, *v1.Container) ([]string, error) +} diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index b89e7fcd96e..0f155389cbe 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -67,6 +67,8 @@ import ( "k8s.io/kubernetes/pkg/kubelet/dockertools" "k8s.io/kubernetes/pkg/kubelet/events" "k8s.io/kubernetes/pkg/kubelet/eviction" + "k8s.io/kubernetes/pkg/kubelet/gpu" + "k8s.io/kubernetes/pkg/kubelet/gpu/nvidia" "k8s.io/kubernetes/pkg/kubelet/images" "k8s.io/kubernetes/pkg/kubelet/kuberuntime" "k8s.io/kubernetes/pkg/kubelet/lifecycle" @@ -450,7 +452,6 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub nonMasqueradeCIDR: kubeCfg.NonMasqueradeCIDR, maxPods: int(kubeCfg.MaxPods), podsPerCore: int(kubeCfg.PodsPerCore), - nvidiaGPUs: int(kubeCfg.NvidiaGPUs), syncLoopMonitor: atomic.Value{}, resolverConfig: kubeCfg.ResolverConfig, cpuCFSQuota: kubeCfg.CPUCFSQuota, @@ -786,7 +787,16 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub klet.appArmorValidator = apparmor.NewValidator(kubeCfg.ContainerRuntime) klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator)) - + if utilfeature.DefaultFeatureGate.Enabled(features.Accelerators) { + if kubeCfg.ContainerRuntime != "docker" { + return nil, fmt.Errorf("Accelerators feature is supported with docker runtime only.") + } + if klet.gpuManager, err = nvidia.NewNvidiaGPUManager(klet, klet.dockerClient); err != nil { + return nil, err + } + } else { + klet.gpuManager = gpu.NewGPUManagerStub() + } // Finally, put the most recent version of the config on the Kubelet, so // people can see how it was configured. klet.kubeletConfiguration = *kubeCfg @@ -981,9 +991,6 @@ type Kubelet struct { // Maximum Number of Pods which can be run by this Kubelet maxPods int - // Number of NVIDIA GPUs on this node - nvidiaGPUs int - // Monitor Kubelet's sync loop syncLoopMonitor atomic.Value @@ -1089,6 +1096,9 @@ type Kubelet struct { // This should only be enabled when the container runtime is performing user remapping AND if the // experimental behavior is desired. experimentalHostUserNamespaceDefaulting bool + + // GPU Manager + gpuManager gpu.GPUManager } // setupDataDirs creates: @@ -1182,7 +1192,10 @@ func (kl *Kubelet) initializeModules() error { return fmt.Errorf("Failed to start OOM watcher %v", err) } - // Step 7: Start resource analyzer + // Step 7: Initialize GPUs + kl.gpuManager.Start() + + // Step 8: Start resource analyzer kl.resourceAnalyzer.Start() return nil diff --git a/pkg/kubelet/kubelet_node_status.go b/pkg/kubelet/kubelet_node_status.go index 50b1aeffc80..791effef0fd 100644 --- a/pkg/kubelet/kubelet_node_status.go +++ b/pkg/kubelet/kubelet_node_status.go @@ -482,6 +482,14 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) { node.Status.Capacity = v1.ResourceList{} } + // populate GPU capacity. + gpuCapacity := kl.gpuManager.Capacity() + if gpuCapacity != nil { + for k, v := range gpuCapacity { + node.Status.Capacity[k] = v + } + } + // TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start // cAdvisor locally, e.g. for test-cmd.sh, and in integration test. 
info, err := kl.GetCachedMachineInfo() @@ -491,8 +499,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) { node.Status.Capacity[v1.ResourceCPU] = *resource.NewMilliQuantity(0, resource.DecimalSI) node.Status.Capacity[v1.ResourceMemory] = resource.MustParse("0Gi") node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(int64(kl.maxPods), resource.DecimalSI) - node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(kl.nvidiaGPUs), resource.DecimalSI) - glog.Errorf("Error getting machine info: %v", err) } else { node.Status.NodeInfo.MachineID = info.MachineID @@ -509,8 +515,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) { node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity( int64(kl.maxPods), resource.DecimalSI) } - node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity( - int64(kl.nvidiaGPUs), resource.DecimalSI) if node.Status.NodeInfo.BootID != "" && node.Status.NodeInfo.BootID != info.BootID { // TODO: This requires a transaction, either both node status is updated diff --git a/pkg/kubelet/kubelet_node_status_test.go b/pkg/kubelet/kubelet_node_status_test.go index a77622728b8..1f57051a932 100644 --- a/pkg/kubelet/kubelet_node_status_test.go +++ b/pkg/kubelet/kubelet_node_status_test.go @@ -208,16 +208,14 @@ func TestUpdateNewNodeStatus(t *testing.T) { KubeProxyVersion: version.Get().String(), }, Capacity: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), - v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI), + v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), }, Allocatable: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), - v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI), + v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), }, Addresses: []v1.NodeAddress{ {Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"}, @@ -402,10 +400,9 @@ func TestUpdateExistingNodeStatus(t *testing.T) { v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), }, Allocatable: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(2800, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), - v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI), + v1.ResourceCPU: *resource.NewMilliQuantity(2800, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), }, }, } @@ -482,16 +479,14 @@ func TestUpdateExistingNodeStatus(t *testing.T) { KubeProxyVersion: version.Get().String(), }, Capacity: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), - v1.ResourceNvidiaGPU: 
*resource.NewQuantity(0, resource.DecimalSI), + v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), }, Allocatable: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), - v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI), + v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), }, Addresses: []v1.NodeAddress{ {Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"}, @@ -790,16 +785,14 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) { KubeProxyVersion: version.Get().String(), }, Capacity: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), - v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI), + v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), }, Allocatable: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), - v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI), + v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), }, Addresses: []v1.NodeAddress{ {Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"}, diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go index 8cdbe3ae138..a6e85c723f5 100644 --- a/pkg/kubelet/kubelet_pods.go +++ b/pkg/kubelet/kubelet_pods.go @@ -84,20 +84,23 @@ func (kl *Kubelet) getActivePods() []*v1.Pod { } // makeDevices determines the devices for the given container. -// Experimental. For now, we hardcode /dev/nvidia0 no matter what the user asks for -// (we only support one device per node). -// TODO: add support for more than 1 GPU after #28216. -func makeDevices(container *v1.Container) []kubecontainer.DeviceInfo { - nvidiaGPULimit := container.Resources.Limits.NvidiaGPU() - if nvidiaGPULimit.Value() != 0 { - return []kubecontainer.DeviceInfo{ - {PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"}, - {PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"}, - {PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"}, - } +// Experimental. +func (kl *Kubelet) makeDevices(pod *v1.Pod, container *v1.Container) ([]kubecontainer.DeviceInfo, error) { + if container.Resources.Limits.NvidiaGPU().IsZero() { + return nil, nil } - return nil + nvidiaGPUPaths, err := kl.gpuManager.AllocateGPU(pod, container) + if err != nil { + return nil, err + } + var devices []kubecontainer.DeviceInfo + for _, path := range nvidiaGPUPaths { + // Devices have to be mapped one to one because of nvidia CUDA library requirements. 
+ devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: path, Permissions: "mrw"}) + } + + return devices, nil } // makeMounts determines the mount points for the given container. @@ -285,7 +288,10 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Contai opts.PortMappings = kubecontainer.MakePortMappings(container) // TODO(random-liu): Move following convert functions into pkg/kubelet/container - opts.Devices = makeDevices(container) + opts.Devices, err = kl.makeDevices(pod, container) + if err != nil { + return nil, err + } opts.Mounts, err = makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIP, volumes) if err != nil { diff --git a/pkg/kubelet/kubelet_pods_test.go b/pkg/kubelet/kubelet_pods_test.go index 6f88b805049..7c7a2cb7068 100644 --- a/pkg/kubelet/kubelet_pods_test.go +++ b/pkg/kubelet/kubelet_pods_test.go @@ -27,7 +27,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" @@ -1711,39 +1710,6 @@ func TestGetHostPortConflicts(t *testing.T) { assert.True(t, hasHostPortConflicts(pods), "Should have port conflicts") } -func TestMakeDevices(t *testing.T) { - testCases := []struct { - container *v1.Container - devices []kubecontainer.DeviceInfo - test string - }{ - { - test: "no device", - container: &v1.Container{}, - devices: nil, - }, - { - test: "gpu", - container: &v1.Container{ - Resources: v1.ResourceRequirements{ - Limits: map[v1.ResourceName]resource.Quantity{ - v1.ResourceNvidiaGPU: resource.MustParse("1000"), - }, - }, - }, - devices: []kubecontainer.DeviceInfo{ - {PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"}, - {PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"}, - {PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"}, - }, - }, - } - - for _, test := range testCases { - assert.Equal(t, test.devices, makeDevices(test.container), "[test %q]", test.test) - } -} - func TestHasHostMountPVC(t *testing.T) { tests := map[string]struct { pvError error diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 2c49e7085e3..6dfb43e8c74 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -49,6 +49,7 @@ import ( kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" containertest "k8s.io/kubernetes/pkg/kubelet/container/testing" "k8s.io/kubernetes/pkg/kubelet/eviction" + "k8s.io/kubernetes/pkg/kubelet/gpu" "k8s.io/kubernetes/pkg/kubelet/images" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/network" @@ -272,7 +273,7 @@ func newTestKubeletWithImageList( kubelet.AddPodSyncLoopHandler(activeDeadlineHandler) kubelet.AddPodSyncHandler(activeDeadlineHandler) - + kubelet.gpuManager = gpu.NewGPUManagerStub() return &TestKubelet{kubelet, fakeRuntime, mockCadvisor, fakeKubeClient, fakeMirrorClient, fakeClock, nil, plug} } diff --git a/pkg/kubemark/hollow_kubelet.go b/pkg/kubemark/hollow_kubelet.go index 92f3e329baf..ef38f444e58 100644 --- a/pkg/kubemark/hollow_kubelet.go +++ b/pkg/kubemark/hollow_kubelet.go @@ -150,7 +150,6 @@ func GetHollowKubeletConfig( c.MaxContainerCount = 100 c.MaxOpenFiles = 1024 c.MaxPerPodContainerCount = 2 - c.NvidiaGPUs = 0 c.RegisterNode = true c.RegisterSchedulable = true 
c.RegistryBurst = 10 diff --git a/test/e2e/framework/pods.go b/test/e2e/framework/pods.go index b9dd95ad33f..17430ddef48 100644 --- a/test/e2e/framework/pods.go +++ b/test/e2e/framework/pods.go @@ -224,7 +224,7 @@ func (c *PodClient) WaitForErrorEventOrSuccess(pod *v1.Pod) (*v1.Event, error) { return ev, err } -// MatchContainerOutput gest output of a container and match expected regexp in the output. +// MatchContainerOutput gets output of a container and match expected regexp in the output. func (c *PodClient) MatchContainerOutput(name string, containerName string, expectedRegexp string) error { f := c.f output, err := GetPodLogs(f.ClientSet, f.Namespace.Name, name, containerName) diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index 31cf23408a5..5408bad9009 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -14,6 +14,7 @@ go_library( "benchmark_util.go", "container.go", "doc.go", + "gpus.go", "image_list.go", "resource_collector.go", "simple_mount.go", @@ -37,12 +38,14 @@ go_library( "//vendor:github.com/onsi/gomega", "//vendor:github.com/opencontainers/runc/libcontainer/cgroups", "//vendor:k8s.io/apimachinery/pkg/api/errors", + "//vendor:k8s.io/apimachinery/pkg/api/resource", "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", "//vendor:k8s.io/apimachinery/pkg/labels", "//vendor:k8s.io/apimachinery/pkg/util/runtime", "//vendor:k8s.io/apimachinery/pkg/util/sets", "//vendor:k8s.io/apimachinery/pkg/util/uuid", "//vendor:k8s.io/apimachinery/pkg/util/wait", + "//vendor:k8s.io/client-go/pkg/api", ], ) diff --git a/test/e2e_node/gpus.go b/test/e2e_node/gpus.go new file mode 100644 index 00000000000..d8c651f2e8c --- /dev/null +++ b/test/e2e_node/gpus.go @@ -0,0 +1,135 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e_node + +import ( + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/pkg/api" + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/apis/componentconfig" + "k8s.io/kubernetes/test/e2e/framework" + + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" +) + +const acceleratorsFeatureGate = "Accelerators=true" + +// Serial because the test updates kubelet configuration. +var _ = framework.KubeDescribe("GPU [Serial]", func() { + f := framework.NewDefaultFramework("gpu-test") + Context("attempt to use GPUs if available", func() { + It("setup the node and create pods to test gpus", func() { + By("ensuring that dynamic kubelet configuration is enabled") + enabled, err := isKubeletConfigEnabled(f) + framework.ExpectNoError(err) + if !enabled { + Skip("Dynamic Kubelet configuration is not enabled. 
Skipping test.") } + + By("enabling support for GPUs") + var oldCfg *componentconfig.KubeletConfiguration + defer func() { + if oldCfg != nil { + framework.ExpectNoError(setKubeletConfiguration(f, oldCfg)) + } + }() + + oldCfg, err = getCurrentKubeletConfig() + framework.ExpectNoError(err) + clone, err := api.Scheme.DeepCopy(oldCfg) + framework.ExpectNoError(err) + newCfg := clone.(*componentconfig.KubeletConfiguration) + if newCfg.FeatureGates != "" { + newCfg.FeatureGates = fmt.Sprintf("%s,%s", acceleratorsFeatureGate, newCfg.FeatureGates) + } else { + newCfg.FeatureGates = acceleratorsFeatureGate + } + framework.ExpectNoError(setKubeletConfiguration(f, newCfg)) + + By("Getting the local node object from the api server") + nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err, "getting node list") + Expect(len(nodeList.Items)).To(Equal(1)) + node := nodeList.Items[0] + gpusAvailable := node.Status.Capacity.NvidiaGPU() + By("Skipping the test if GPUs aren't available") + if gpusAvailable.IsZero() { + Skip("No GPUs available on local node. Skipping test.") + } + + By("Creating a pod that will consume all GPUs") + podSuccess := makePod(gpusAvailable.Value(), "gpus-success") + podSuccess = f.PodClient().CreateSync(podSuccess) + + By("Checking if the pod outputted Success to its logs") + framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success")) + + By("Creating a new pod requesting a GPU and noticing that it is rejected by the Kubelet") + podFailure := makePod(1, "gpu-failure") + framework.WaitForPodCondition(f.ClientSet, f.Namespace.Name, podFailure.Name, "pod rejected", framework.PodStartTimeout, func(pod *v1.Pod) (bool, error) { + if pod.Status.Phase == v1.PodFailed { + return true, nil + + } + return false, nil + }) + + By("stopping the original Pod with GPUs") + gp := int64(0) + deleteOptions := metav1.DeleteOptions{ + GracePeriodSeconds: &gp, + } + f.PodClient().DeleteSync(podSuccess.Name, &deleteOptions, 30*time.Second) + + By("attempting to start the failed pod again") + f.PodClient().DeleteSync(podFailure.Name, &deleteOptions, 10*time.Second) + podFailure = f.PodClient().CreateSync(podFailure) + + By("Checking if the pod outputted Success to its logs") + framework.ExpectNoError(f.PodClient().MatchContainerOutput(podFailure.Name, podFailure.Name, "Success")) + }) + }) +}) + +func makePod(gpus int64, name string) *v1.Pod { + resources := v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI), + }, + } + gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$' | wc -l) ]]; then exit 1; fi; echo Success; sleep 10240 ", gpus) + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Image: "gcr.io/google_containers/busybox:1.24", + Name: name, + Command: []string{"sh", "-c", gpuverificationCmd}, + Resources: resources, + }, + }, + }, + } +}
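
Illustration only, not part of the patch: a minimal consumer sketch of the GPUManager interface introduced above, following the same sequence the kubelet uses in this change (Start during initializeModules, Capacity when setting node status, AllocateGPU when building container run options). It uses the stub manager so no Docker client or Nvidia devices are assumed; a real node would construct nvidia.NewNvidiaGPUManager behind the Accelerators feature gate instead. The package main wrapper and the container name are hypothetical.

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/kubelet/gpu"
)

func main() {
	// The stub is what the kubelet falls back to when the Accelerators gate is off.
	var mgr gpu.GPUManager = gpu.NewGPUManagerStub()

	// Start performs one-time initialization; the nvidia implementation discovers
	// /dev/nvidia* devices here, while the stub is a no-op.
	if err := mgr.Start(); err != nil {
		fmt.Printf("failed to start GPU manager: %v\n", err)
		return
	}

	// Capacity is merged into node.Status.Capacity by setNodeStatusMachineInfo.
	fmt.Printf("GPU capacity: %v\n", mgr.Capacity())

	// AllocateGPU returns host device paths that makeDevices maps one-to-one into
	// the container; the stub always reports that GPUs are unsupported.
	pod := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{Name: "gpu-container"}}}}
	if _, err := mgr.AllocateGPU(pod, &pod.Spec.Containers[0]); err != nil {
		fmt.Printf("allocation failed as expected: %v\n", err)
	}
}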