Merge pull request #42116 from vishh/gpu-experimental-support

Automatic merge from submit-queue

Extend experimental support to multiple Nvidia GPUs

Extended from #28216

```release-note
The `--experimental-nvidia-gpus` flag is **replaced** by the `Accelerators` alpha feature gate, which adds support for multiple Nvidia GPUs.
To use GPUs, pass `Accelerators=true` as part of the `--feature-gates` flag.
Works only with the Docker runtime.
```
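
For illustration, a minimal Go sketch of a pod that requests two GPUs via `v1.ResourceNvidiaGPU`, mirroring the `makePod` helper in the node e2e test below; the `gpuPod` helper and the busybox image are placeholders, not part of this change:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/pkg/api/v1"
)

// gpuPod builds a pod whose single container asks for `gpus` Nvidia GPUs.
// GPUs have to be requested in limits; the GPU manager added by this PR only
// inspects container.Resources.Limits.
func gpuPod(name string, gpus int64) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: name},
		Spec: v1.PodSpec{
			Containers: []v1.Container{{
				Name:  name,
				Image: "gcr.io/google_containers/busybox:1.24", // placeholder image
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
					},
				},
			}},
		},
	}
}

func main() {
	// A pod that would consume two GPUs on a node started with --feature-gates=Accelerators=true.
	fmt.Printf("%+v\n", gpuPod("gpu-demo", 2))
}
```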

1. Automated testing for this PR is not possible, since creating clusters with GPUs isn't yet supported in GCP.
2. To test this PR locally, use the node e2e:
```shell
TEST_ARGS='--feature-gates=DynamicKubeletConfig=true' FOCUS=GPU SKIP="" make test-e2e-node
```

TODO:

- [x] Run manual tests
- [x] Add node e2e
- [x] Add unit tests for GPU manager (< 100% coverage)
- [ ] Add unit tests in kubelet package
Kubernetes Submit Queue 2017-03-01 04:52:50 -08:00 committed by GitHub
commit ed479163fa
25 changed files with 866 additions and 103 deletions


@ -206,7 +206,6 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.BoolVar(&s.BabysitDaemons, "babysit-daemons", s.BabysitDaemons, "If true, the node has babysitter process monitoring docker and kubelet.")
fs.MarkDeprecated("babysit-daemons", "Will be removed in a future version.")
fs.Int32Var(&s.MaxPods, "max-pods", s.MaxPods, "Number of Pods that can run on this Kubelet.")
fs.Int32Var(&s.NvidiaGPUs, "experimental-nvidia-gpus", s.NvidiaGPUs, "Number of NVIDIA GPU devices on this node. Only 0 (default) and 1 are currently supported.")
// TODO(#40229): Remove the docker-exec-handler flag.
fs.StringVar(&s.DockerExecHandlerName, "docker-exec-handler", s.DockerExecHandlerName, "Handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.")
fs.MarkDeprecated("docker-exec-handler", "this flag will be removed and only the 'native' handler will be supported in the future.")


@ -690,3 +690,4 @@ windows-line-endings
www-prefix
zone-id
zone-name


@ -362,8 +362,6 @@ type KubeletConfiguration struct {
BabysitDaemons bool
// maxPods is the number of pods that can run on this Kubelet.
MaxPods int32
// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
NvidiaGPUs int32
// dockerExecHandlerName is the handler to use when executing a command
// in a container. Valid values are 'native' and 'nsenter'. Defaults to
// 'native'.


@ -407,8 +407,6 @@ type KubeletConfiguration struct {
BabysitDaemons bool `json:"babysitDaemons"`
// maxPods is the number of pods that can run on this Kubelet.
MaxPods int32 `json:"maxPods"`
// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
NvidiaGPUs int32 `json:"nvidiaGPUs"`
// dockerExecHandlerName is the handler to use when executing a command
// in a container. Valid values are 'native' and 'nsenter'. Defaults to
// 'native'.


@ -353,7 +353,6 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu
out.HairpinMode = in.HairpinMode
out.BabysitDaemons = in.BabysitDaemons
out.MaxPods = in.MaxPods
out.NvidiaGPUs = in.NvidiaGPUs
out.DockerExecHandlerName = in.DockerExecHandlerName
out.PodCIDR = in.PodCIDR
out.ResolverConfig = in.ResolverConfig
@ -531,7 +530,6 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu
out.HairpinMode = in.HairpinMode
out.BabysitDaemons = in.BabysitDaemons
out.MaxPods = in.MaxPods
out.NvidiaGPUs = in.NvidiaGPUs
out.DockerExecHandlerName = in.DockerExecHandlerName
out.PodCIDR = in.PodCIDR
out.ResolverConfig = in.ResolverConfig


@ -73,6 +73,14 @@ const (
// Determines if affinity defined in annotations should be processed
// TODO: remove when alpha support for affinity is removed
AffinityInAnnotations utilfeature.Feature = "AffinityInAnnotations"
// owner: @vishh
// alpha: v1.6
//
// Enables support for GPUs as a schedulable resource.
// Only Nvidia GPUs are supported as of v1.6.
// Works only with Docker Container Runtime.
Accelerators utilfeature.Feature = "Accelerators"
)
func init() {
@ -90,6 +98,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta},
ExperimentalCriticalPodAnnotation: {Default: false, PreRelease: utilfeature.Alpha},
AffinityInAnnotations: {Default: false, PreRelease: utilfeature.Alpha},
Accelerators: {Default: false, PreRelease: utilfeature.Alpha},
// inherited features from generic apiserver, relisted here to get a conflict if it is changed
// unintentionally on either side:
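
Downstream code is expected to consult this gate before wiring up GPU support. A minimal sketch of that check, matching the kubelet change further below (the `acceleratorsEnabled` wrapper is illustrative and the import paths assume the v1.6 source layout):

```go
package example

import (
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/features"
)

// acceleratorsEnabled reports whether the alpha Accelerators gate was switched
// on via --feature-gates=Accelerators=true.
func acceleratorsEnabled() bool {
	return utilfeature.DefaultFeatureGate.Enabled(features.Accelerators)
}
```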


@ -13153,13 +13153,6 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope
Format: "int32", Format: "int32",
}, },
}, },
"nvidiaGPUs": {
SchemaProps: spec.SchemaProps{
Description: "nvidiaGPUs is the number of NVIDIA GPU devices on this node.",
Type: []string{"integer"},
Format: "int32",
},
},
"dockerExecHandlerName": { "dockerExecHandlerName": {
SchemaProps: spec.SchemaProps{ SchemaProps: spec.SchemaProps{
Description: "dockerExecHandlerName is the handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.", Description: "dockerExecHandlerName is the handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.",
@ -13494,7 +13487,7 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope
}, },
}, },
}, },
Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "nvidiaGPUs", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"}, Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", 
"maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"},
}, },
}, },
Dependencies: []string{ Dependencies: []string{


@ -58,6 +58,8 @@ go_library(
"//pkg/kubelet/envvars:go_default_library", "//pkg/kubelet/envvars:go_default_library",
"//pkg/kubelet/events:go_default_library", "//pkg/kubelet/events:go_default_library",
"//pkg/kubelet/eviction:go_default_library", "//pkg/kubelet/eviction:go_default_library",
"//pkg/kubelet/gpu:go_default_library",
"//pkg/kubelet/gpu/nvidia:go_default_library",
"//pkg/kubelet/images:go_default_library", "//pkg/kubelet/images:go_default_library",
"//pkg/kubelet/kuberuntime:go_default_library", "//pkg/kubelet/kuberuntime:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library", "//pkg/kubelet/lifecycle:go_default_library",
@ -169,6 +171,7 @@ go_test(
"//pkg/kubelet/container:go_default_library", "//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/container/testing:go_default_library", "//pkg/kubelet/container/testing:go_default_library",
"//pkg/kubelet/eviction:go_default_library", "//pkg/kubelet/eviction:go_default_library",
"//pkg/kubelet/gpu:go_default_library",
"//pkg/kubelet/images:go_default_library", "//pkg/kubelet/images:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library", "//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/network:go_default_library", "//pkg/kubelet/network:go_default_library",
@ -246,6 +249,7 @@ filegroup(
"//pkg/kubelet/envvars:all-srcs", "//pkg/kubelet/envvars:all-srcs",
"//pkg/kubelet/events:all-srcs", "//pkg/kubelet/events:all-srcs",
"//pkg/kubelet/eviction:all-srcs", "//pkg/kubelet/eviction:all-srcs",
"//pkg/kubelet/gpu:all-srcs",
"//pkg/kubelet/images:all-srcs", "//pkg/kubelet/images:all-srcs",
"//pkg/kubelet/kuberuntime:all-srcs", "//pkg/kubelet/kuberuntime:all-srcs",
"//pkg/kubelet/leaky:all-srcs", "//pkg/kubelet/leaky:all-srcs",

pkg/kubelet/gpu/BUILD (new file, 34 lines)

@ -0,0 +1,34 @@
package(default_visibility = ["//visibility:public"])
licenses(["notice"])
load(
"@io_bazel_rules_go//go:def.bzl",
"go_library",
)
go_library(
name = "go_default_library",
srcs = [
"gpu_manager_stub.go",
"types.go",
],
tags = ["automanaged"],
deps = ["//pkg/api/v1:go_default_library"],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [
":package-srcs",
"//pkg/kubelet/gpu/nvidia:all-srcs",
],
tags = ["automanaged"],
)


@ -0,0 +1,41 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gpu
import (
"fmt"
"k8s.io/kubernetes/pkg/api/v1"
)
type gpuManagerStub struct{}
func (gms *gpuManagerStub) Start() error {
return nil
}
func (gms *gpuManagerStub) Capacity() v1.ResourceList {
return nil
}
func (gms *gpuManagerStub) AllocateGPU(_ *v1.Pod, _ *v1.Container) ([]string, error) {
return nil, fmt.Errorf("GPUs are not supported")
}
func NewGPUManagerStub() GPUManager {
return &gpuManagerStub{}
}


@ -0,0 +1,54 @@
package(default_visibility = ["//visibility:public"])
licenses(["notice"])
load(
"@io_bazel_rules_go//go:def.bzl",
"go_library",
"go_test",
)
go_library(
name = "go_default_library",
srcs = [
"helpers.go",
"nvidia_gpu_manager.go",
],
tags = ["automanaged"],
deps = [
"//pkg/api/v1:go_default_library",
"//pkg/kubelet/dockertools:go_default_library",
"//pkg/kubelet/gpu:go_default_library",
"//vendor:github.com/golang/glog",
"//vendor:k8s.io/apimachinery/pkg/api/resource",
"//vendor:k8s.io/apimachinery/pkg/util/sets",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
)
go_test(
name = "go_default_test",
srcs = ["nvidia_gpu_manager_test.go"],
library = ":go_default_library",
tags = ["automanaged"],
deps = [
"//pkg/api/v1:go_default_library",
"//vendor:github.com/stretchr/testify/assert",
"//vendor:k8s.io/apimachinery/pkg/api/resource",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
"//vendor:k8s.io/apimachinery/pkg/util/sets",
"//vendor:k8s.io/apimachinery/pkg/util/uuid",
],
)


@ -0,0 +1,59 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nvidia
import "k8s.io/apimachinery/pkg/util/sets"
// podGPUs represents a list of pod to GPU mappings.
type podGPUs struct {
podGPUMapping map[string]sets.String
}
func newPodGPUs() *podGPUs {
return &podGPUs{
podGPUMapping: map[string]sets.String{},
}
}
func (pgpu *podGPUs) pods() sets.String {
ret := sets.NewString()
for k := range pgpu.podGPUMapping {
ret.Insert(k)
}
return ret
}
func (pgpu *podGPUs) insert(podUID string, device string) {
if _, exists := pgpu.podGPUMapping[podUID]; !exists {
pgpu.podGPUMapping[podUID] = sets.NewString(device)
} else {
pgpu.podGPUMapping[podUID].Insert(device)
}
}
func (pgpu *podGPUs) delete(pods []string) {
for _, uid := range pods {
delete(pgpu.podGPUMapping, uid)
}
}
func (pgpu *podGPUs) devices() sets.String {
ret := sets.NewString()
for _, devices := range pgpu.podGPUMapping {
ret = ret.Union(devices)
}
return ret
}
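
An illustrative test-style sketch (not part of the PR) of how `podGPUs` tracks device assignments per pod UID and frees them when a pod's UID is deleted:

```go
package nvidia

import (
	"testing"

	"k8s.io/apimachinery/pkg/util/sets"
)

// TestPodGPUsBookkeeping sketches the bookkeeping behavior of podGPUs.
func TestPodGPUsBookkeeping(t *testing.T) {
	pg := newPodGPUs()
	pg.insert("pod-a", "/dev/nvidia0")
	pg.insert("pod-a", "/dev/nvidia1")
	pg.insert("pod-b", "/dev/nvidia2")

	if !pg.pods().Equal(sets.NewString("pod-a", "pod-b")) {
		t.Fatalf("unexpected pods: %v", pg.pods().List())
	}
	if pg.devices().Len() != 3 {
		t.Fatalf("expected 3 devices in use, got %v", pg.devices().List())
	}

	// GPUs bound to terminated pods are reclaimed by deleting their UIDs.
	pg.delete([]string{"pod-a"})
	if !pg.devices().Equal(sets.NewString("/dev/nvidia2")) {
		t.Fatalf("expected only /dev/nvidia2 to remain, got %v", pg.devices().List())
	}
}
```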


@ -0,0 +1,279 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nvidia
import (
"fmt"
"io/ioutil"
"os"
"path"
"regexp"
"sync"
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/kubelet/dockertools"
"k8s.io/kubernetes/pkg/kubelet/gpu"
)
// TODO: rework to use Nvidia's NVML, which is more complex, but also provides more fine-grained information and stats.
const (
// All NVIDIA GPUs cards should be mounted with nvidiactl and nvidia-uvm
// If the driver installed correctly, the 2 devices will be there.
nvidiaCtlDevice string = "/dev/nvidiactl"
nvidiaUVMDevice string = "/dev/nvidia-uvm"
// Optional device.
nvidiaUVMToolsDevice string = "/dev/nvidia-uvm-tools"
devDirectory = "/dev"
nvidiaDeviceRE = `^nvidia[0-9]*$`
nvidiaFullpathRE = `^/dev/nvidia[0-9]*$`
)
type activePodsLister interface {
// Returns a list of active pods on the node.
GetRunningPods() ([]*v1.Pod, error)
}
// nvidiaGPUManager manages nvidia gpu devices.
type nvidiaGPUManager struct {
sync.Mutex
// All gpus available on the Node
allGPUs sets.String
allocated *podGPUs
defaultDevices []string
// The interface which could get GPU mapping from all the containers.
// TODO: Should make this independent of Docker in the future.
dockerClient dockertools.DockerInterface
activePodsLister activePodsLister
}
// NewNvidiaGPUManager returns a GPUManager that manages local Nvidia GPUs.
// TODO: Migrate to use pod level cgroups and make it generic to all runtimes.
func NewNvidiaGPUManager(activePodsLister activePodsLister, dockerClient dockertools.DockerInterface) (gpu.GPUManager, error) {
if dockerClient == nil {
return nil, fmt.Errorf("invalid docker client specified")
}
return &nvidiaGPUManager{
allGPUs: sets.NewString(),
dockerClient: dockerClient,
activePodsLister: activePodsLister,
}, nil
}
// Initialize the GPU devices, so far only needed to discover the GPU paths.
func (ngm *nvidiaGPUManager) Start() error {
if ngm.dockerClient == nil {
return fmt.Errorf("Invalid docker client specified in GPU Manager")
}
ngm.Lock()
defer ngm.Unlock()
if _, err := os.Stat(nvidiaCtlDevice); err != nil {
return err
}
if _, err := os.Stat(nvidiaUVMDevice); err != nil {
return err
}
ngm.defaultDevices = []string{nvidiaCtlDevice, nvidiaUVMDevice}
_, err := os.Stat(nvidiaUVMToolsDevice)
if !os.IsNotExist(err) {
ngm.defaultDevices = append(ngm.defaultDevices, nvidiaUVMToolsDevice)
}
if err := ngm.discoverGPUs(); err != nil {
return err
}
// It's possible that the runtime isn't available now.
allocatedGPUs, err := ngm.gpusInUse()
if err == nil {
ngm.allocated = allocatedGPUs
}
// We ignore errors when identifying allocated GPUs because it is possible that the runtime interfaces may be not be logically up.
return nil
}
// Get how many GPU cards we have.
func (ngm *nvidiaGPUManager) Capacity() v1.ResourceList {
gpus := resource.NewQuantity(int64(len(ngm.allGPUs)), resource.DecimalSI)
return v1.ResourceList{
v1.ResourceNvidiaGPU: *gpus,
}
}
// AllocateGPUs returns `num` GPUs if available, error otherwise.
// Allocation is made thread safe using the following logic.
// A list of all GPUs allocated is maintained along with their respective Pod UIDs.
// It is expected that the list of active pods will not return any false positives.
// As part of initialization or allocation, the list of GPUs in use will be computed once.
// Whenever an allocation happens, the list of GPUs allocated is updated based on the list of currently active pods.
// GPUs allocated to terminated pods are freed up lazily as part of allocation.
// GPUs are allocated based on the internal list of allocatedGPUs.
// It is not safe to generate a list of GPUs in use by inspecting active containers because of the delay between GPU allocation and container creation.
// A GPU allocated to a container might be re-allocated to a subsequent container because the original container wasn't started quick enough.
// The current algorithm scans containers only once and then uses a list of active pods to track GPU usage.
// This is a sub-optimal solution and a better alternative would be that of using pod level cgroups instead.
// GPUs allocated to containers should be reflected in pod level device cgroups before completing allocations.
// The pod level cgroups will then serve as a checkpoint of GPUs in use.
func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) ([]string, error) {
gpusNeeded := container.Resources.Limits.NvidiaGPU().Value()
if gpusNeeded == 0 {
return []string{}, nil
}
ngm.Lock()
defer ngm.Unlock()
if ngm.allocated == nil {
// Initialization is not complete. Try now. Failures can no longer be tolerated.
allocated, err := ngm.gpusInUse()
if err != nil {
return nil, fmt.Errorf("Failed to allocate GPUs because of issues identifying GPUs in use: %v", err)
}
ngm.allocated = allocated
} else {
// update internal list of GPUs in use prior to allocating new GPUs.
if err := ngm.updateAllocatedGPUs(); err != nil {
return nil, fmt.Errorf("Failed to allocate GPUs because of issues with updating GPUs in use: %v", err)
}
}
// Get GPU devices in use.
devicesInUse := ngm.allocated.devices()
glog.V(5).Infof("gpus in use: %v", devicesInUse.List())
// Get a list of available GPUs.
available := ngm.allGPUs.Difference(devicesInUse)
glog.V(5).Infof("gpus available: %v", available.List())
if int64(available.Len()) < gpusNeeded {
return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", gpusNeeded, available.Len())
}
ret := available.UnsortedList()[:gpusNeeded]
for _, device := range ret {
// Update internal allocated GPU cache.
ngm.allocated.insert(string(pod.UID), device)
}
// Add standard devices files that needs to be exposed.
ret = append(ret, ngm.defaultDevices...)
return ret, nil
}
// updateAllocatedGPUs updates the list of GPUs in use.
// It gets a list of running pods and then frees any GPUs that are bound to terminated pods.
// Returns error on failure.
func (ngm *nvidiaGPUManager) updateAllocatedGPUs() error {
activePods, err := ngm.activePodsLister.GetRunningPods()
if err != nil {
return fmt.Errorf("Failed to list active pods: %v", err)
}
activePodUids := sets.NewString()
for _, pod := range activePods {
activePodUids.Insert(string(pod.UID))
}
allocatedPodUids := ngm.allocated.pods()
podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
ngm.allocated.delete(podsToBeRemoved.List())
return nil
}
// discoverGPUs identifies allGPUs NVIDIA GPU devices available on the local node by walking `/dev` directory.
// TODO: Without NVML support we only can check whether there has GPU devices, but
// could not give a health check or get more information like GPU cores, memory, or
// family name. Need to support NVML in the future. But we do not need NVML until
// we want more features, features like schedule containers according to GPU family
// name.
func (ngm *nvidiaGPUManager) discoverGPUs() error {
reg := regexp.MustCompile(nvidiaDeviceRE)
files, err := ioutil.ReadDir(devDirectory)
if err != nil {
return err
}
for _, f := range files {
if f.IsDir() {
continue
}
if reg.MatchString(f.Name()) {
glog.V(2).Infof("Found Nvidia GPU %q", f.Name())
ngm.allGPUs.Insert(path.Join(devDirectory, f.Name()))
}
}
return nil
}
// gpusInUse returns a list of GPUs in use along with the respective pods that are using it.
func (ngm *nvidiaGPUManager) gpusInUse() (*podGPUs, error) {
pods, err := ngm.activePodsLister.GetRunningPods()
if err != nil {
return nil, err
}
type podContainers struct {
uid string
containerIDs sets.String
}
// List of containers to inspect.
podContainersToInspect := []podContainers{}
for _, pod := range pods {
containers := sets.NewString()
for _, container := range pod.Spec.Containers {
// GPUs are expected to be specified only in limits.
if !container.Resources.Limits.NvidiaGPU().IsZero() {
containers.Insert(container.Name)
}
}
// If no GPUs were requested skip this pod.
if containers.Len() == 0 {
continue
}
containerIDs := sets.NewString()
for _, container := range pod.Status.ContainerStatuses {
if containers.Has(container.Name) {
containerIDs.Insert(container.ContainerID)
}
}
// add the pod and its containers that need to be inspected.
podContainersToInspect = append(podContainersToInspect, podContainers{string(pod.UID), containerIDs})
}
ret := newPodGPUs()
for _, podContainer := range podContainersToInspect {
for _, containerId := range podContainer.containerIDs.List() {
containerJSON, err := ngm.dockerClient.InspectContainer(containerId)
if err != nil {
glog.V(3).Infof("Failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", containerId, podContainer.uid)
continue
}
devices := containerJSON.HostConfig.Devices
if devices == nil {
continue
}
for _, device := range devices {
if isValidPath(device.PathOnHost) {
glog.V(4).Infof("Nvidia GPU %q is in use by Docker Container: %q", device.PathOnHost, containerJSON.ID)
ret.insert(podContainer.uid, device.PathOnHost)
}
}
}
}
return ret, nil
}
func isValidPath(path string) bool {
return regexp.MustCompile(nvidiaFullpathRE).MatchString(path)
}
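
The comment block on `AllocateGPU` above packs in a lot of reasoning; here is a simplified, standalone sketch of just the pick-from-the-free-set step (no locking, no reconciliation against Docker; `pickGPUs` is a hypothetical helper, not code from the PR):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

// pickGPUs mirrors the core of AllocateGPU: subtract the devices already
// assigned from the full set, then hand out the requested count. Like
// UnsortedList in the manager, the order of the returned devices is arbitrary.
func pickGPUs(all, inUse sets.String, needed int) ([]string, error) {
	available := all.Difference(inUse)
	if available.Len() < needed {
		return nil, fmt.Errorf("requested %d GPUs, only %d available", needed, available.Len())
	}
	return available.UnsortedList()[:needed], nil
}

func main() {
	all := sets.NewString("/dev/nvidia0", "/dev/nvidia1", "/dev/nvidia2")
	inUse := sets.NewString("/dev/nvidia0")
	devices, err := pickGPUs(all, inUse, 2)
	fmt.Println(devices, err) // two of /dev/nvidia1, /dev/nvidia2; err is nil
}
```

The real manager additionally appends the shared control devices (`/dev/nvidiactl`, `/dev/nvidia-uvm`) to whatever it hands out.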


@ -0,0 +1,144 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nvidia
import (
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/kubernetes/pkg/api/v1"
)
type testActivePodsLister struct {
activePods []*v1.Pod
}
func (tapl *testActivePodsLister) GetRunningPods() ([]*v1.Pod, error) {
return tapl.activePods, nil
}
func makeTestPod(numContainers int) *v1.Pod {
quantity := resource.NewQuantity(1, resource.DecimalSI)
resources := v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceNvidiaGPU: *quantity,
},
}
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{},
},
}
for ; numContainers > 0; numContainers-- {
pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{
Resources: resources,
})
}
return pod
}
func TestMultiContainerPodGPUAllocation(t *testing.T) {
podLister := &testActivePodsLister{}
testGpuManager := &nvidiaGPUManager{
activePodsLister: podLister,
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
allocated: newPodGPUs(),
}
// Expect that no devices are in use.
gpusInUse, err := testGpuManager.gpusInUse()
as := assert.New(t)
as.Nil(err)
as.Equal(len(gpusInUse.devices()), 0)
// Allocated GPUs for a pod with two containers.
pod := makeTestPod(2)
// Allocate for the first container.
devices1, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[0])
as.Nil(err)
as.Equal(len(devices1), 1)
podLister.activePods = append(podLister.activePods, pod)
// Allocate for the second container.
devices2, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[1])
as.Nil(err)
as.Equal(len(devices2), 1)
as.NotEqual(devices1, devices2, "expected containers to get different devices")
// further allocations should fail.
newPod := makeTestPod(2)
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
as.NotNil(err, "expected gpu allocation to fail. got: %v", devices1)
// Now terminate the original pod and observe that GPU allocation for new pod succeeds.
podLister.activePods = podLister.activePods[:0]
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
as.Nil(err)
as.Equal(len(devices1), 1)
podLister.activePods = append(podLister.activePods, newPod)
devices2, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[1])
as.Nil(err)
as.Equal(len(devices2), 1)
as.NotEqual(devices1, devices2, "expected containers to get different devices")
}
func TestMultiPodGPUAllocation(t *testing.T) {
podLister := &testActivePodsLister{}
testGpuManager := &nvidiaGPUManager{
activePodsLister: podLister,
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
allocated: newPodGPUs(),
}
// Expect that no devices are in use.
gpusInUse, err := testGpuManager.gpusInUse()
as := assert.New(t)
as.Nil(err)
as.Equal(len(gpusInUse.devices()), 0)
// Allocated GPUs for a pod with two containers.
podA := makeTestPod(1)
// Allocate for the first container.
devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
as.Nil(err)
as.Equal(len(devicesA), 1)
podLister.activePods = append(podLister.activePods, podA)
// further allocations should fail.
podB := makeTestPod(1)
// Allocate for the first container.
devicesB, err := testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
as.Nil(err)
as.Equal(len(devicesB), 1)
as.NotEqual(devicesA, devicesB, "expected pods to get different devices")
}

pkg/kubelet/gpu/types.go (new file, 32 lines)

@ -0,0 +1,32 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gpu
import "k8s.io/kubernetes/pkg/api/v1"
// GPUManager manages GPUs on a local node.
// Implementations are expected to be thread safe.
type GPUManager interface {
// Start logically initializes GPUManager
Start() error
// Capacity returns the total number of GPUs on the node.
Capacity() v1.ResourceList
// AllocateGPU attempts to allocate GPUs for input container.
// Returns paths to allocated GPUs and nil on success.
// Returns an error on failure.
AllocateGPU(*v1.Pod, *v1.Container) ([]string, error)
}
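
A hedged sketch of how a caller drives this interface; the kubelet wiring below chooses between the Nvidia implementation and the stub based on the `Accelerators` gate, so the `startAndReport` helper here is illustrative only:

```go
package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/kubelet/gpu"
)

// startAndReport initializes any GPUManager and returns its capacity.
// AllocateGPU is then called per container when run options are generated.
func startAndReport(m gpu.GPUManager) (v1.ResourceList, error) {
	if err := m.Start(); err != nil {
		return nil, err
	}
	return m.Capacity(), nil
}

func main() {
	// The stub is what the kubelet falls back to when the Accelerators gate is off.
	capacity, err := startAndReport(gpu.NewGPUManagerStub())
	fmt.Println(capacity, err)
}
```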


@ -67,6 +67,8 @@ import (
"k8s.io/kubernetes/pkg/kubelet/dockertools" "k8s.io/kubernetes/pkg/kubelet/dockertools"
"k8s.io/kubernetes/pkg/kubelet/events" "k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/eviction" "k8s.io/kubernetes/pkg/kubelet/eviction"
"k8s.io/kubernetes/pkg/kubelet/gpu"
"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
"k8s.io/kubernetes/pkg/kubelet/images" "k8s.io/kubernetes/pkg/kubelet/images"
"k8s.io/kubernetes/pkg/kubelet/kuberuntime" "k8s.io/kubernetes/pkg/kubelet/kuberuntime"
"k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/lifecycle"
@ -450,7 +452,6 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
nonMasqueradeCIDR: kubeCfg.NonMasqueradeCIDR, nonMasqueradeCIDR: kubeCfg.NonMasqueradeCIDR,
maxPods: int(kubeCfg.MaxPods), maxPods: int(kubeCfg.MaxPods),
podsPerCore: int(kubeCfg.PodsPerCore), podsPerCore: int(kubeCfg.PodsPerCore),
nvidiaGPUs: int(kubeCfg.NvidiaGPUs),
syncLoopMonitor: atomic.Value{}, syncLoopMonitor: atomic.Value{},
resolverConfig: kubeCfg.ResolverConfig, resolverConfig: kubeCfg.ResolverConfig,
cpuCFSQuota: kubeCfg.CPUCFSQuota, cpuCFSQuota: kubeCfg.CPUCFSQuota,
@ -786,7 +787,16 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
klet.appArmorValidator = apparmor.NewValidator(kubeCfg.ContainerRuntime) klet.appArmorValidator = apparmor.NewValidator(kubeCfg.ContainerRuntime)
klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator)) klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator))
if utilfeature.DefaultFeatureGate.Enabled(features.Accelerators) {
if kubeCfg.ContainerRuntime != "docker" {
return nil, fmt.Errorf("Accelerators feature is supported with docker runtime only.")
}
if klet.gpuManager, err = nvidia.NewNvidiaGPUManager(klet, klet.dockerClient); err != nil {
return nil, err
}
} else {
klet.gpuManager = gpu.NewGPUManagerStub()
}
// Finally, put the most recent version of the config on the Kubelet, so // Finally, put the most recent version of the config on the Kubelet, so
// people can see how it was configured. // people can see how it was configured.
klet.kubeletConfiguration = *kubeCfg klet.kubeletConfiguration = *kubeCfg
@ -981,9 +991,6 @@ type Kubelet struct {
// Maximum Number of Pods which can be run by this Kubelet // Maximum Number of Pods which can be run by this Kubelet
maxPods int maxPods int
// Number of NVIDIA GPUs on this node
nvidiaGPUs int
// Monitor Kubelet's sync loop // Monitor Kubelet's sync loop
syncLoopMonitor atomic.Value syncLoopMonitor atomic.Value
@ -1089,6 +1096,9 @@ type Kubelet struct {
// This should only be enabled when the container runtime is performing user remapping AND if the // This should only be enabled when the container runtime is performing user remapping AND if the
// experimental behavior is desired. // experimental behavior is desired.
experimentalHostUserNamespaceDefaulting bool experimentalHostUserNamespaceDefaulting bool
// GPU Manager
gpuManager gpu.GPUManager
} }
// setupDataDirs creates: // setupDataDirs creates:
@ -1182,7 +1192,10 @@ func (kl *Kubelet) initializeModules() error {
return fmt.Errorf("Failed to start OOM watcher %v", err) return fmt.Errorf("Failed to start OOM watcher %v", err)
} }
// Step 7: Start resource analyzer // Step 7: Initialize GPUs
kl.gpuManager.Start()
// Step 8: Start resource analyzer
kl.resourceAnalyzer.Start() kl.resourceAnalyzer.Start()
return nil return nil


@ -482,6 +482,14 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.Capacity = v1.ResourceList{}
}
// populate GPU capacity.
gpuCapacity := kl.gpuManager.Capacity()
if gpuCapacity != nil {
for k, v := range gpuCapacity {
node.Status.Capacity[k] = v
}
}
// TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start
// cAdvisor locally, e.g. for test-cmd.sh, and in integration test.
info, err := kl.GetCachedMachineInfo()
@ -491,8 +499,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.Capacity[v1.ResourceCPU] = *resource.NewMilliQuantity(0, resource.DecimalSI)
node.Status.Capacity[v1.ResourceMemory] = resource.MustParse("0Gi")
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(int64(kl.maxPods), resource.DecimalSI)
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(kl.nvidiaGPUs), resource.DecimalSI)
glog.Errorf("Error getting machine info: %v", err)
} else {
node.Status.NodeInfo.MachineID = info.MachineID
@ -509,8 +515,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(
int64(kl.maxPods), resource.DecimalSI)
}
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(
int64(kl.nvidiaGPUs), resource.DecimalSI)
if node.Status.NodeInfo.BootID != "" &&
node.Status.NodeInfo.BootID != info.BootID {
// TODO: This requires a transaction, either both node status is updated


@ -211,13 +211,11 @@ func TestUpdateNewNodeStatus(t *testing.T) {
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
},
Allocatable: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
},
Addresses: []v1.NodeAddress{
{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},
@ -405,7 +403,6 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
v1.ResourceCPU: *resource.NewMilliQuantity(2800, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
},
},
}
@ -485,13 +482,11 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
},
Allocatable: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
},
Addresses: []v1.NodeAddress{
{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},
@ -793,13 +788,11 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) {
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
},
Allocatable: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
},
Addresses: []v1.NodeAddress{
{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},


@ -84,20 +84,23 @@ func (kl *Kubelet) getActivePods() []*v1.Pod {
}

// makeDevices determines the devices for the given container.
// Experimental. For now, we hardcode /dev/nvidia0 no matter what the user asks for
// (we only support one device per node).
// TODO: add support for more than 1 GPU after #28216.
func makeDevices(container *v1.Container) []kubecontainer.DeviceInfo {
nvidiaGPULimit := container.Resources.Limits.NvidiaGPU()
if nvidiaGPULimit.Value() != 0 {
return []kubecontainer.DeviceInfo{
{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
}
}
return nil
}

// makeDevices determines the devices for the given container.
// Experimental.
func (kl *Kubelet) makeDevices(pod *v1.Pod, container *v1.Container) ([]kubecontainer.DeviceInfo, error) {
if container.Resources.Limits.NvidiaGPU().IsZero() {
return nil, nil
}
nvidiaGPUPaths, err := kl.gpuManager.AllocateGPU(pod, container)
if err != nil {
return nil, err
}
var devices []kubecontainer.DeviceInfo
for _, path := range nvidiaGPUPaths {
// Devices have to be mapped one to one because of nvidia CUDA library requirements.
devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: path, Permissions: "mrw"})
}
return devices, nil
}

// makeMounts determines the mount points for the given container.
@ -285,7 +288,10 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Contai
opts.PortMappings = kubecontainer.MakePortMappings(container)
// TODO(random-liu): Move following convert functions into pkg/kubelet/container
opts.Devices = makeDevices(container)
opts.Devices, err = kl.makeDevices(pod, container)
if err != nil {
return nil, err
}
opts.Mounts, err = makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIP, volumes)
if err != nil {


@ -27,7 +27,6 @@ import (
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
apierrors "k8s.io/apimachinery/pkg/api/errors" apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
@ -1711,39 +1710,6 @@ func TestGetHostPortConflicts(t *testing.T) {
assert.True(t, hasHostPortConflicts(pods), "Should have port conflicts") assert.True(t, hasHostPortConflicts(pods), "Should have port conflicts")
} }
func TestMakeDevices(t *testing.T) {
testCases := []struct {
container *v1.Container
devices []kubecontainer.DeviceInfo
test string
}{
{
test: "no device",
container: &v1.Container{},
devices: nil,
},
{
test: "gpu",
container: &v1.Container{
Resources: v1.ResourceRequirements{
Limits: map[v1.ResourceName]resource.Quantity{
v1.ResourceNvidiaGPU: resource.MustParse("1000"),
},
},
},
devices: []kubecontainer.DeviceInfo{
{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
},
},
}
for _, test := range testCases {
assert.Equal(t, test.devices, makeDevices(test.container), "[test %q]", test.test)
}
}
func TestHasHostMountPVC(t *testing.T) {
tests := map[string]struct {
pvError error


@ -49,6 +49,7 @@ import (
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
containertest "k8s.io/kubernetes/pkg/kubelet/container/testing" containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
"k8s.io/kubernetes/pkg/kubelet/eviction" "k8s.io/kubernetes/pkg/kubelet/eviction"
"k8s.io/kubernetes/pkg/kubelet/gpu"
"k8s.io/kubernetes/pkg/kubelet/images" "k8s.io/kubernetes/pkg/kubelet/images"
"k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/network" "k8s.io/kubernetes/pkg/kubelet/network"
@ -272,7 +273,7 @@ func newTestKubeletWithImageList(
kubelet.AddPodSyncLoopHandler(activeDeadlineHandler) kubelet.AddPodSyncLoopHandler(activeDeadlineHandler)
kubelet.AddPodSyncHandler(activeDeadlineHandler) kubelet.AddPodSyncHandler(activeDeadlineHandler)
kubelet.gpuManager = gpu.NewGPUManagerStub()
return &TestKubelet{kubelet, fakeRuntime, mockCadvisor, fakeKubeClient, fakeMirrorClient, fakeClock, nil, plug} return &TestKubelet{kubelet, fakeRuntime, mockCadvisor, fakeKubeClient, fakeMirrorClient, fakeClock, nil, plug}
} }


@ -150,7 +150,6 @@ func GetHollowKubeletConfig(
c.MaxContainerCount = 100
c.MaxOpenFiles = 1024
c.MaxPerPodContainerCount = 2
c.NvidiaGPUs = 0
c.RegisterNode = true
c.RegisterSchedulable = true
c.RegistryBurst = 10


@ -224,7 +224,7 @@ func (c *PodClient) WaitForErrorEventOrSuccess(pod *v1.Pod) (*v1.Event, error) {
return ev, err
}
// MatchContainerOutput gest output of a container and match expected regexp in the output.
// MatchContainerOutput gets output of a container and match expected regexp in the output.
func (c *PodClient) MatchContainerOutput(name string, containerName string, expectedRegexp string) error {
f := c.f
output, err := GetPodLogs(f.ClientSet, f.Namespace.Name, name, containerName)


@ -14,6 +14,7 @@ go_library(
"benchmark_util.go", "benchmark_util.go",
"container.go", "container.go",
"doc.go", "doc.go",
"gpus.go",
"image_list.go", "image_list.go",
"resource_collector.go", "resource_collector.go",
"simple_mount.go", "simple_mount.go",
@ -37,12 +38,14 @@ go_library(
"//vendor:github.com/onsi/gomega", "//vendor:github.com/onsi/gomega",
"//vendor:github.com/opencontainers/runc/libcontainer/cgroups", "//vendor:github.com/opencontainers/runc/libcontainer/cgroups",
"//vendor:k8s.io/apimachinery/pkg/api/errors", "//vendor:k8s.io/apimachinery/pkg/api/errors",
"//vendor:k8s.io/apimachinery/pkg/api/resource",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
"//vendor:k8s.io/apimachinery/pkg/labels", "//vendor:k8s.io/apimachinery/pkg/labels",
"//vendor:k8s.io/apimachinery/pkg/util/runtime", "//vendor:k8s.io/apimachinery/pkg/util/runtime",
"//vendor:k8s.io/apimachinery/pkg/util/sets", "//vendor:k8s.io/apimachinery/pkg/util/sets",
"//vendor:k8s.io/apimachinery/pkg/util/uuid", "//vendor:k8s.io/apimachinery/pkg/util/uuid",
"//vendor:k8s.io/apimachinery/pkg/util/wait", "//vendor:k8s.io/apimachinery/pkg/util/wait",
"//vendor:k8s.io/client-go/pkg/api",
], ],
) )

test/e2e_node/gpus.go (new file, 135 lines)

@ -0,0 +1,135 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e_node
import (
"fmt"
"time"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/apis/componentconfig"
"k8s.io/kubernetes/test/e2e/framework"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
const acceleratorsFeatureGate = "Accelerators=true"
// Serial because the test updates kubelet configuration.
var _ = framework.KubeDescribe("GPU [Serial]", func() {
f := framework.NewDefaultFramework("gpu-test")
Context("attempt to use GPUs if available", func() {
It("setup the node and create pods to test gpus", func() {
By("ensuring that dynamic kubelet configuration is enabled")
enabled, err := isKubeletConfigEnabled(f)
framework.ExpectNoError(err)
if !enabled {
Skip("Dynamic Kubelet configuration is not enabled. Skipping test.")
}
By("enabling support for GPUs")
var oldCfg *componentconfig.KubeletConfiguration
defer func() {
if oldCfg != nil {
framework.ExpectNoError(setKubeletConfiguration(f, oldCfg))
}
}()
oldCfg, err = getCurrentKubeletConfig()
framework.ExpectNoError(err)
clone, err := api.Scheme.DeepCopy(oldCfg)
framework.ExpectNoError(err)
newCfg := clone.(*componentconfig.KubeletConfiguration)
if newCfg.FeatureGates != "" {
newCfg.FeatureGates = fmt.Sprintf("%s,%s", acceleratorsFeatureGate, newCfg.FeatureGates)
} else {
newCfg.FeatureGates = acceleratorsFeatureGate
}
framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
By("Getting the local node object from the api server")
nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
framework.ExpectNoError(err, "getting node list")
Expect(len(nodeList.Items)).To(Equal(1))
node := nodeList.Items[0]
gpusAvailable := node.Status.Capacity.NvidiaGPU()
By("Skipping the test if GPUs aren't available")
if gpusAvailable.IsZero() {
Skip("No GPUs available on local node. Skipping test.")
}
By("Creating a pod that will consume all GPUs")
podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
podSuccess = f.PodClient().CreateSync(podSuccess)
By("Checking if the pod outputted Success to its logs")
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))
By("Creating a new pod requesting a GPU and noticing that it is rejected by the Kubelet")
podFailure := makePod(1, "gpu-failure")
framework.WaitForPodCondition(f.ClientSet, f.Namespace.Name, podFailure.Name, "pod rejected", framework.PodStartTimeout, func(pod *v1.Pod) (bool, error) {
if pod.Status.Phase == v1.PodFailed {
return true, nil
}
return false, nil
})
By("stopping the original Pod with GPUs")
gp := int64(0)
deleteOptions := metav1.DeleteOptions{
GracePeriodSeconds: &gp,
}
f.PodClient().DeleteSync(podSuccess.Name, &deleteOptions, 30*time.Second)
By("attempting to start the failed pod again")
f.PodClient().DeleteSync(podFailure.Name, &deleteOptions, 10*time.Second)
podFailure = f.PodClient().CreateSync(podFailure)
By("Checking if the pod outputted Success to its logs")
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podFailure.Name, podFailure.Name, "Success"))
})
})
})
func makePod(gpus int64, name string) *v1.Pod {
resources := v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
},
}
gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success; sleep 10240 ", gpus)
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Image: "gcr.io/google_containers/busybox:1.24",
Name: name,
Command: []string{"sh", "-c", gpuverificationCmd},
Resources: resources,
},
},
},
}
}