Merge pull request #42116 from vishh/gpu-experimental-support
Automatic merge from submit-queue.

**Extend experimental support to multiple Nvidia GPUs** (extended from #28216)

```release-note
The `--experimental-nvidia-gpus` flag is **replaced** by the `Accelerators` alpha feature gate, along with support for multiple Nvidia GPUs. To use GPUs, pass `Accelerators=true` as part of the `--feature-gates` flag. Works only with the Docker runtime.
```

1. Automated testing for this PR is not possible, since creating clusters with GPUs isn't supported in GCP yet.
2. To test this PR locally, use the node e2e:

```shell
TEST_ARGS='--feature-gates=DynamicKubeletConfig=true' FOCUS=GPU SKIP="" make test-e2e-node
```

TODO:
- [x] Run manual tests
- [x] Add node e2e
- [x] Add unit tests for GPU manager (< 100% coverage)
- [ ] Add unit tests in kubelet package
Commit ed479163fa
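Before the diff, a bit of context on how the new code path gets exercised: the only signal the GPU manager reads is the alpha Nvidia GPU resource in a container's limits. A minimal Go sketch (the helper name and values are illustrative; `v1.ResourceNvidiaGPU` and the limits-only convention come from the code below):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/kubernetes/pkg/api/v1"
)

// gpuContainer is an illustrative helper: it builds a container spec that
// requests `gpus` Nvidia GPUs through its resource limits, which is where the
// new GPU manager looks for GPU requests.
func gpuContainer(name string, gpus int64) v1.Container {
	return v1.Container{
		Name: name,
		Resources: v1.ResourceRequirements{
			Limits: v1.ResourceList{
				v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
			},
		},
	}
}

func main() {
	c := gpuContainer("cuda-job", 2)
	fmt.Println(c.Resources.Limits.NvidiaGPU().Value()) // prints 2
}
```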
@@ -206,7 +206,6 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.BoolVar(&s.BabysitDaemons, "babysit-daemons", s.BabysitDaemons, "If true, the node has babysitter process monitoring docker and kubelet.")
fs.MarkDeprecated("babysit-daemons", "Will be removed in a future version.")
fs.Int32Var(&s.MaxPods, "max-pods", s.MaxPods, "Number of Pods that can run on this Kubelet.")
fs.Int32Var(&s.NvidiaGPUs, "experimental-nvidia-gpus", s.NvidiaGPUs, "Number of NVIDIA GPU devices on this node. Only 0 (default) and 1 are currently supported.")
// TODO(#40229): Remove the docker-exec-handler flag.
fs.StringVar(&s.DockerExecHandlerName, "docker-exec-handler", s.DockerExecHandlerName, "Handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.")
fs.MarkDeprecated("docker-exec-handler", "this flag will be removed and only the 'native' handler will be supported in the future.")
@@ -690,3 +690,4 @@ windows-line-endings
www-prefix
zone-id
zone-name
@ -362,8 +362,6 @@ type KubeletConfiguration struct {
|
||||
BabysitDaemons bool
|
||||
// maxPods is the number of pods that can run on this Kubelet.
|
||||
MaxPods int32
|
||||
// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
|
||||
NvidiaGPUs int32
|
||||
// dockerExecHandlerName is the handler to use when executing a command
|
||||
// in a container. Valid values are 'native' and 'nsenter'. Defaults to
|
||||
// 'native'.
|
||||
|
@ -407,8 +407,6 @@ type KubeletConfiguration struct {
|
||||
BabysitDaemons bool `json:"babysitDaemons"`
|
||||
// maxPods is the number of pods that can run on this Kubelet.
|
||||
MaxPods int32 `json:"maxPods"`
|
||||
// nvidiaGPUs is the number of NVIDIA GPU devices on this node.
|
||||
NvidiaGPUs int32 `json:"nvidiaGPUs"`
|
||||
// dockerExecHandlerName is the handler to use when executing a command
|
||||
// in a container. Valid values are 'native' and 'nsenter'. Defaults to
|
||||
// 'native'.
|
||||
|
@ -353,7 +353,6 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu
|
||||
out.HairpinMode = in.HairpinMode
|
||||
out.BabysitDaemons = in.BabysitDaemons
|
||||
out.MaxPods = in.MaxPods
|
||||
out.NvidiaGPUs = in.NvidiaGPUs
|
||||
out.DockerExecHandlerName = in.DockerExecHandlerName
|
||||
out.PodCIDR = in.PodCIDR
|
||||
out.ResolverConfig = in.ResolverConfig
|
||||
@ -531,7 +530,6 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu
|
||||
out.HairpinMode = in.HairpinMode
|
||||
out.BabysitDaemons = in.BabysitDaemons
|
||||
out.MaxPods = in.MaxPods
|
||||
out.NvidiaGPUs = in.NvidiaGPUs
|
||||
out.DockerExecHandlerName = in.DockerExecHandlerName
|
||||
out.PodCIDR = in.PodCIDR
|
||||
out.ResolverConfig = in.ResolverConfig
|
||||
|
@ -73,6 +73,14 @@ const (
|
||||
// Determines if affinity defined in annotations should be processed
|
||||
// TODO: remove when alpha support for affinity is removed
|
||||
AffinityInAnnotations utilfeature.Feature = "AffinityInAnnotations"
|
||||
|
||||
// owner: @vishh
|
||||
// alpha: v1.6
|
||||
//
|
||||
// Enables support for GPUs as a schedulable resource.
|
||||
// Only Nvidia GPUs are supported as of v1.6.
|
||||
// Works only with Docker Container Runtime.
|
||||
Accelerators utilfeature.Feature = "Accelerators"
|
||||
)
|
||||
|
||||
func init() {
|
||||
@ -90,6 +98,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
|
||||
ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta},
|
||||
ExperimentalCriticalPodAnnotation: {Default: false, PreRelease: utilfeature.Alpha},
|
||||
AffinityInAnnotations: {Default: false, PreRelease: utilfeature.Alpha},
|
||||
Accelerators: {Default: false, PreRelease: utilfeature.Alpha},
|
||||
|
||||
// inherited features from generic apiserver, relisted here to get a conflict if it is changed
|
||||
// unintentionally on either side:
|
||||
|
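For context, here is a hedged sketch of how the new `Accelerators` gate is consulted by consumers; the kubelet does exactly this further down in the diff. The wrapper function is illustrative, and the `utilfeature` import path is assumed from the vendoring of this era:

```go
package example

// Import path for utilfeature is assumed from the vendoring of this era; treat
// it as an assumption rather than a documented API location.
import (
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/features"
)

// acceleratorsEnabled is an illustrative wrapper: components gate their
// Nvidia-specific wiring on this check, exactly as the kubelet does later in
// this diff.
func acceleratorsEnabled() bool {
	return utilfeature.DefaultFeatureGate.Enabled(features.Accelerators)
}
```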
@ -13153,13 +13153,6 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope
|
||||
Format: "int32",
|
||||
},
|
||||
},
|
||||
"nvidiaGPUs": {
|
||||
SchemaProps: spec.SchemaProps{
|
||||
Description: "nvidiaGPUs is the number of NVIDIA GPU devices on this node.",
|
||||
Type: []string{"integer"},
|
||||
Format: "int32",
|
||||
},
|
||||
},
|
||||
"dockerExecHandlerName": {
|
||||
SchemaProps: spec.SchemaProps{
|
||||
Description: "dockerExecHandlerName is the handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.",
|
||||
@ -13494,7 +13487,7 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope
|
||||
},
|
||||
},
|
||||
},
|
||||
Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "nvidiaGPUs", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"},
|
||||
Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"},
|
||||
},
|
||||
},
|
||||
Dependencies: []string{
|
||||
|
@ -58,6 +58,8 @@ go_library(
|
||||
"//pkg/kubelet/envvars:go_default_library",
|
||||
"//pkg/kubelet/events:go_default_library",
|
||||
"//pkg/kubelet/eviction:go_default_library",
|
||||
"//pkg/kubelet/gpu:go_default_library",
|
||||
"//pkg/kubelet/gpu/nvidia:go_default_library",
|
||||
"//pkg/kubelet/images:go_default_library",
|
||||
"//pkg/kubelet/kuberuntime:go_default_library",
|
||||
"//pkg/kubelet/lifecycle:go_default_library",
|
||||
@ -169,6 +171,7 @@ go_test(
|
||||
"//pkg/kubelet/container:go_default_library",
|
||||
"//pkg/kubelet/container/testing:go_default_library",
|
||||
"//pkg/kubelet/eviction:go_default_library",
|
||||
"//pkg/kubelet/gpu:go_default_library",
|
||||
"//pkg/kubelet/images:go_default_library",
|
||||
"//pkg/kubelet/lifecycle:go_default_library",
|
||||
"//pkg/kubelet/network:go_default_library",
|
||||
@ -246,6 +249,7 @@ filegroup(
|
||||
"//pkg/kubelet/envvars:all-srcs",
|
||||
"//pkg/kubelet/events:all-srcs",
|
||||
"//pkg/kubelet/eviction:all-srcs",
|
||||
"//pkg/kubelet/gpu:all-srcs",
|
||||
"//pkg/kubelet/images:all-srcs",
|
||||
"//pkg/kubelet/kuberuntime:all-srcs",
|
||||
"//pkg/kubelet/leaky:all-srcs",
|
||||
|
pkg/kubelet/gpu/BUILD (new file, 34 lines)
@@ -0,0 +1,34 @@
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
load(
|
||||
"@io_bazel_rules_go//go:def.bzl",
|
||||
"go_library",
|
||||
)
|
||||
|
||||
go_library(
|
||||
name = "go_default_library",
|
||||
srcs = [
|
||||
"gpu_manager_stub.go",
|
||||
"types.go",
|
||||
],
|
||||
tags = ["automanaged"],
|
||||
deps = ["//pkg/api/v1:go_default_library"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "package-srcs",
|
||||
srcs = glob(["**"]),
|
||||
tags = ["automanaged"],
|
||||
visibility = ["//visibility:private"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all-srcs",
|
||||
srcs = [
|
||||
":package-srcs",
|
||||
"//pkg/kubelet/gpu/nvidia:all-srcs",
|
||||
],
|
||||
tags = ["automanaged"],
|
||||
)
|
pkg/kubelet/gpu/gpu_manager_stub.go (new file, 41 lines)
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gpu
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
)
|
||||
|
||||
type gpuManagerStub struct{}
|
||||
|
||||
func (gms *gpuManagerStub) Start() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gms *gpuManagerStub) Capacity() v1.ResourceList {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gms *gpuManagerStub) AllocateGPU(_ *v1.Pod, _ *v1.Container) ([]string, error) {
|
||||
return nil, fmt.Errorf("GPUs are not supported")
|
||||
}
|
||||
|
||||
func NewGPUManagerStub() GPUManager {
|
||||
return &gpuManagerStub{}
|
||||
}
|
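A brief usage sketch of the stub's contract, so the fallback behaviour is concrete; the function and variable names here are illustrative, not part of the commit:

```go
package example

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/kubelet/gpu"
)

// demoStub is illustrative only. With the Accelerators gate disabled the
// kubelet wires in this stub: Start is a no-op, no GPU capacity is advertised,
// and any allocation attempt fails fast. Assumes the pod has at least one
// container.
func demoStub(pod *v1.Pod) {
	var mgr gpu.GPUManager = gpu.NewGPUManagerStub()
	_ = mgr.Start()             // always nil
	fmt.Println(mgr.Capacity()) // nil: no GPU resources advertised
	if _, err := mgr.AllocateGPU(pod, &pod.Spec.Containers[0]); err != nil {
		fmt.Println(err) // "GPUs are not supported"
	}
}
```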
pkg/kubelet/gpu/nvidia/BUILD (new file, 54 lines)
@@ -0,0 +1,54 @@
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
load(
|
||||
"@io_bazel_rules_go//go:def.bzl",
|
||||
"go_library",
|
||||
"go_test",
|
||||
)
|
||||
|
||||
go_library(
|
||||
name = "go_default_library",
|
||||
srcs = [
|
||||
"helpers.go",
|
||||
"nvidia_gpu_manager.go",
|
||||
],
|
||||
tags = ["automanaged"],
|
||||
deps = [
|
||||
"//pkg/api/v1:go_default_library",
|
||||
"//pkg/kubelet/dockertools:go_default_library",
|
||||
"//pkg/kubelet/gpu:go_default_library",
|
||||
"//vendor:github.com/golang/glog",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/resource",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||
],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "package-srcs",
|
||||
srcs = glob(["**"]),
|
||||
tags = ["automanaged"],
|
||||
visibility = ["//visibility:private"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all-srcs",
|
||||
srcs = [":package-srcs"],
|
||||
tags = ["automanaged"],
|
||||
)
|
||||
|
||||
go_test(
|
||||
name = "go_default_test",
|
||||
srcs = ["nvidia_gpu_manager_test.go"],
|
||||
library = ":go_default_library",
|
||||
tags = ["automanaged"],
|
||||
deps = [
|
||||
"//pkg/api/v1:go_default_library",
|
||||
"//vendor:github.com/stretchr/testify/assert",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/resource",
|
||||
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/uuid",
|
||||
],
|
||||
)
|
pkg/kubelet/gpu/nvidia/helpers.go (new file, 59 lines)
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nvidia
|
||||
|
||||
import "k8s.io/apimachinery/pkg/util/sets"
|
||||
|
||||
// podGPUs represents a list of pod to GPU mappings.
|
||||
type podGPUs struct {
|
||||
podGPUMapping map[string]sets.String
|
||||
}
|
||||
|
||||
func newPodGPUs() *podGPUs {
|
||||
return &podGPUs{
|
||||
podGPUMapping: map[string]sets.String{},
|
||||
}
|
||||
}
|
||||
func (pgpu *podGPUs) pods() sets.String {
|
||||
ret := sets.NewString()
|
||||
for k := range pgpu.podGPUMapping {
|
||||
ret.Insert(k)
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
func (pgpu *podGPUs) insert(podUID string, device string) {
|
||||
if _, exists := pgpu.podGPUMapping[podUID]; !exists {
|
||||
pgpu.podGPUMapping[podUID] = sets.NewString(device)
|
||||
} else {
|
||||
pgpu.podGPUMapping[podUID].Insert(device)
|
||||
}
|
||||
}
|
||||
|
||||
func (pgpu *podGPUs) delete(pods []string) {
|
||||
for _, uid := range pods {
|
||||
delete(pgpu.podGPUMapping, uid)
|
||||
}
|
||||
}
|
||||
|
||||
func (pgpu *podGPUs) devices() sets.String {
|
||||
ret := sets.NewString()
|
||||
for _, devices := range pgpu.podGPUMapping {
|
||||
ret = ret.Union(devices)
|
||||
}
|
||||
return ret
|
||||
}
|
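To make the bookkeeping concrete, a hedged sketch (same package, illustrative function) of the insert/reconcile/free cycle the manager drives through `podGPUs`:

```go
package nvidia

import "k8s.io/apimachinery/pkg/util/sets"

// demoPodGPUs is illustrative only: record two devices for one pod, reclaim
// them once the pod is no longer active, and report what is still in use —
// the same lazy-free pattern AllocateGPU relies on.
func demoPodGPUs(activePodUIDs sets.String) sets.String {
	pg := newPodGPUs()
	pg.insert("pod-a", "/dev/nvidia0")
	pg.insert("pod-a", "/dev/nvidia1")

	// Free devices owned by pods that are no longer running.
	terminated := pg.pods().Difference(activePodUIDs)
	pg.delete(terminated.List())

	return pg.devices() // devices still considered in use
}
```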
pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go (new file, 279 lines)
@@ -0,0 +1,279 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path"
|
||||
"regexp"
|
||||
"sync"
|
||||
|
||||
"github.com/golang/glog"
|
||||
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockertools"
|
||||
"k8s.io/kubernetes/pkg/kubelet/gpu"
|
||||
)
|
||||
|
||||
// TODO: rework to use Nvidia's NVML, which is more complex, but also provides more fine-grained information and stats.
|
||||
const (
|
||||
// All NVIDIA GPU cards should be mounted along with nvidiactl and nvidia-uvm.
// If the driver is installed correctly, these two devices will be present.
|
||||
nvidiaCtlDevice string = "/dev/nvidiactl"
|
||||
nvidiaUVMDevice string = "/dev/nvidia-uvm"
|
||||
// Optional device.
|
||||
nvidiaUVMToolsDevice string = "/dev/nvidia-uvm-tools"
|
||||
devDirectory = "/dev"
|
||||
nvidiaDeviceRE = `^nvidia[0-9]*$`
|
||||
nvidiaFullpathRE = `^/dev/nvidia[0-9]*$`
|
||||
)
|
||||
|
||||
type activePodsLister interface {
|
||||
// Returns a list of active pods on the node.
|
||||
GetRunningPods() ([]*v1.Pod, error)
|
||||
}
|
||||
|
||||
// nvidiaGPUManager manages nvidia gpu devices.
|
||||
type nvidiaGPUManager struct {
|
||||
sync.Mutex
|
||||
// All gpus available on the Node
|
||||
allGPUs sets.String
|
||||
allocated *podGPUs
|
||||
defaultDevices []string
|
||||
// The interface which could get GPU mapping from all the containers.
|
||||
// TODO: Should make this independent of Docker in the future.
|
||||
dockerClient dockertools.DockerInterface
|
||||
activePodsLister activePodsLister
|
||||
}
|
||||
|
||||
// NewNvidiaGPUManager returns a GPUManager that manages local Nvidia GPUs.
|
||||
// TODO: Migrate to use pod level cgroups and make it generic to all runtimes.
|
||||
func NewNvidiaGPUManager(activePodsLister activePodsLister, dockerClient dockertools.DockerInterface) (gpu.GPUManager, error) {
|
||||
if dockerClient == nil {
|
||||
return nil, fmt.Errorf("invalid docker client specified")
|
||||
}
|
||||
return &nvidiaGPUManager{
|
||||
allGPUs: sets.NewString(),
|
||||
dockerClient: dockerClient,
|
||||
activePodsLister: activePodsLister,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Initialize the GPU devices, so far only needed to discover the GPU paths.
|
||||
func (ngm *nvidiaGPUManager) Start() error {
|
||||
if ngm.dockerClient == nil {
|
||||
return fmt.Errorf("Invalid docker client specified in GPU Manager")
|
||||
}
|
||||
ngm.Lock()
|
||||
defer ngm.Unlock()
|
||||
|
||||
if _, err := os.Stat(nvidiaCtlDevice); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err := os.Stat(nvidiaUVMDevice); err != nil {
|
||||
return err
|
||||
}
|
||||
ngm.defaultDevices = []string{nvidiaCtlDevice, nvidiaUVMDevice}
|
||||
_, err := os.Stat(nvidiaUVMToolsDevice)
|
||||
if !os.IsNotExist(err) {
|
||||
ngm.defaultDevices = append(ngm.defaultDevices, nvidiaUVMToolsDevice)
|
||||
}
|
||||
|
||||
if err := ngm.discoverGPUs(); err != nil {
|
||||
return err
|
||||
}
|
||||
// It's possible that the runtime isn't available now.
|
||||
allocatedGPUs, err := ngm.gpusInUse()
|
||||
if err == nil {
|
||||
ngm.allocated = allocatedGPUs
|
||||
}
|
||||
// We ignore errors when identifying allocated GPUs because it is possible that the runtime interfaces may not be logically up yet.
|
||||
return nil
|
||||
}
|
||||
|
||||
// Get how many GPU cards we have.
|
||||
func (ngm *nvidiaGPUManager) Capacity() v1.ResourceList {
|
||||
gpus := resource.NewQuantity(int64(len(ngm.allGPUs)), resource.DecimalSI)
|
||||
return v1.ResourceList{
|
||||
v1.ResourceNvidiaGPU: *gpus,
|
||||
}
|
||||
}
|
||||
|
||||
// AllocateGPUs returns `num` GPUs if available, error otherwise.
|
||||
// Allocation is made thread safe using the following logic.
|
||||
// A list of all GPUs allocated is maintained along with their respective Pod UIDs.
|
||||
// It is expected that the list of active pods will not return any false positives.
|
||||
// As part of initialization or allocation, the list of GPUs in use will be computed once.
|
||||
// Whenever an allocation happens, the list of GPUs allocated is updated based on the list of currently active pods.
|
||||
// GPUs allocated to terminated pods are freed up lazily as part of allocation.
|
||||
// GPUs are allocated based on the internal list of allocatedGPUs.
|
||||
// It is not safe to generate a list of GPUs in use by inspecting active containers because of the delay between GPU allocation and container creation.
|
||||
// A GPU allocated to a container might be re-allocated to a subsequent container because the original container wasn't started quickly enough.
|
||||
// The current algorithm scans containers only once and then uses a list of active pods to track GPU usage.
|
||||
// This is a sub-optimal solution and a better alternative would be that of using pod level cgroups instead.
|
||||
// GPUs allocated to containers should be reflected in pod level device cgroups before completing allocations.
|
||||
// The pod level cgroups will then serve as a checkpoint of GPUs in use.
|
||||
func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) ([]string, error) {
|
||||
gpusNeeded := container.Resources.Limits.NvidiaGPU().Value()
|
||||
if gpusNeeded == 0 {
|
||||
return []string{}, nil
|
||||
}
|
||||
ngm.Lock()
|
||||
defer ngm.Unlock()
|
||||
if ngm.allocated == nil {
|
||||
// Initialization is not complete. Try now. Failures can no longer be tolerated.
|
||||
allocated, err := ngm.gpusInUse()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Failed to allocate GPUs because of issues identifying GPUs in use: %v", err)
|
||||
}
|
||||
ngm.allocated = allocated
|
||||
} else {
|
||||
// update internal list of GPUs in use prior to allocating new GPUs.
|
||||
if err := ngm.updateAllocatedGPUs(); err != nil {
|
||||
return nil, fmt.Errorf("Failed to allocate GPUs because of issues with updating GPUs in use: %v", err)
|
||||
}
|
||||
}
|
||||
// Get GPU devices in use.
|
||||
devicesInUse := ngm.allocated.devices()
|
||||
glog.V(5).Infof("gpus in use: %v", devicesInUse.List())
|
||||
// Get a list of available GPUs.
|
||||
available := ngm.allGPUs.Difference(devicesInUse)
|
||||
glog.V(5).Infof("gpus available: %v", available.List())
|
||||
if int64(available.Len()) < gpusNeeded {
|
||||
return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", gpusNeeded, available.Len())
|
||||
}
|
||||
ret := available.UnsortedList()[:gpusNeeded]
|
||||
for _, device := range ret {
|
||||
// Update internal allocated GPU cache.
|
||||
ngm.allocated.insert(string(pod.UID), device)
|
||||
}
|
||||
// Add the standard device files that need to be exposed.
|
||||
ret = append(ret, ngm.defaultDevices...)
|
||||
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
// updateAllocatedGPUs updates the list of GPUs in use.
|
||||
// It gets a list of running pods and then frees any GPUs that are bound to terminated pods.
|
||||
// Returns error on failure.
|
||||
func (ngm *nvidiaGPUManager) updateAllocatedGPUs() error {
|
||||
activePods, err := ngm.activePodsLister.GetRunningPods()
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to list active pods: %v", err)
|
||||
}
|
||||
activePodUids := sets.NewString()
|
||||
for _, pod := range activePods {
|
||||
activePodUids.Insert(string(pod.UID))
|
||||
}
|
||||
allocatedPodUids := ngm.allocated.pods()
|
||||
podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
|
||||
glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
|
||||
ngm.allocated.delete(podsToBeRemoved.List())
|
||||
return nil
|
||||
}
|
||||
|
||||
// discoverGPUs identifies the NVIDIA GPU devices available on the local node by walking the `/dev` directory and records them in allGPUs.
// TODO: Without NVML support we can only check whether GPU devices exist; we
// cannot run health checks or gather details such as GPU cores, memory, or
// family name. NVML support should be added eventually, but it is not needed
// until we want richer features, e.g. scheduling containers by GPU family name.
|
||||
func (ngm *nvidiaGPUManager) discoverGPUs() error {
|
||||
reg := regexp.MustCompile(nvidiaDeviceRE)
|
||||
files, err := ioutil.ReadDir(devDirectory)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, f := range files {
|
||||
if f.IsDir() {
|
||||
continue
|
||||
}
|
||||
if reg.MatchString(f.Name()) {
|
||||
glog.V(2).Infof("Found Nvidia GPU %q", f.Name())
|
||||
ngm.allGPUs.Insert(path.Join(devDirectory, f.Name()))
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// gpusInUse returns a list of GPUs in use along with the respective pods that are using it.
|
||||
func (ngm *nvidiaGPUManager) gpusInUse() (*podGPUs, error) {
|
||||
pods, err := ngm.activePodsLister.GetRunningPods()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
type podContainers struct {
|
||||
uid string
|
||||
containerIDs sets.String
|
||||
}
|
||||
// List of containers to inspect.
|
||||
podContainersToInspect := []podContainers{}
|
||||
for _, pod := range pods {
|
||||
containers := sets.NewString()
|
||||
for _, container := range pod.Spec.Containers {
|
||||
// GPUs are expected to be specified only in limits.
|
||||
if !container.Resources.Limits.NvidiaGPU().IsZero() {
|
||||
containers.Insert(container.Name)
|
||||
}
|
||||
}
|
||||
// If no GPUs were requested, skip this pod.
|
||||
if containers.Len() == 0 {
|
||||
continue
|
||||
}
|
||||
containerIDs := sets.NewString()
|
||||
for _, container := range pod.Status.ContainerStatuses {
|
||||
if containers.Has(container.Name) {
|
||||
containerIDs.Insert(container.ContainerID)
|
||||
}
|
||||
}
|
||||
// add the pod and its containers that need to be inspected.
|
||||
podContainersToInspect = append(podContainersToInspect, podContainers{string(pod.UID), containerIDs})
|
||||
}
|
||||
ret := newPodGPUs()
|
||||
for _, podContainer := range podContainersToInspect {
|
||||
for _, containerId := range podContainer.containerIDs.List() {
|
||||
containerJSON, err := ngm.dockerClient.InspectContainer(containerId)
|
||||
if err != nil {
|
||||
glog.V(3).Infof("Failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", containerId, podContainer.uid)
|
||||
continue
|
||||
}
|
||||
|
||||
devices := containerJSON.HostConfig.Devices
|
||||
if devices == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
for _, device := range devices {
|
||||
if isValidPath(device.PathOnHost) {
|
||||
glog.V(4).Infof("Nvidia GPU %q is in use by Docker Container: %q", device.PathOnHost, containerJSON.ID)
|
||||
ret.insert(podContainer.uid, device.PathOnHost)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func isValidPath(path string) bool {
|
||||
return regexp.MustCompile(nvidiaFullpathRE).MatchString(path)
|
||||
}
|
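The long comment on AllocateGPU describes the allocation step in prose; distilled to its core, it is set arithmetic over device paths. A hedged, simplified restatement (illustrative helper, not part of the commit):

```go
package nvidia

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

// pickDevices is an illustrative distillation of AllocateGPU's core step:
// subtract the devices already recorded as in use from the discovered set and
// hand out the first `needed` paths of what remains.
func pickDevices(all, inUse sets.String, needed int64) ([]string, error) {
	available := all.Difference(inUse)
	if int64(available.Len()) < needed {
		return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", needed, available.Len())
	}
	return available.UnsortedList()[:needed], nil
}
```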
pkg/kubelet/gpu/nvidia/nvidia_gpu_manager_test.go (new file, 144 lines)
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/apimachinery/pkg/util/uuid"
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
)
|
||||
|
||||
type testActivePodsLister struct {
|
||||
activePods []*v1.Pod
|
||||
}
|
||||
|
||||
func (tapl *testActivePodsLister) GetRunningPods() ([]*v1.Pod, error) {
|
||||
return tapl.activePods, nil
|
||||
}
|
||||
|
||||
func makeTestPod(numContainers int) *v1.Pod {
|
||||
quantity := resource.NewQuantity(1, resource.DecimalSI)
|
||||
resources := v1.ResourceRequirements{
|
||||
Limits: v1.ResourceList{
|
||||
v1.ResourceNvidiaGPU: *quantity,
|
||||
},
|
||||
}
|
||||
pod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
UID: uuid.NewUUID(),
|
||||
},
|
||||
Spec: v1.PodSpec{
|
||||
Containers: []v1.Container{},
|
||||
},
|
||||
}
|
||||
for ; numContainers > 0; numContainers-- {
|
||||
pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{
|
||||
Resources: resources,
|
||||
})
|
||||
}
|
||||
return pod
|
||||
}
|
||||
|
||||
func TestMultiContainerPodGPUAllocation(t *testing.T) {
|
||||
podLister := &testActivePodsLister{}
|
||||
|
||||
testGpuManager := &nvidiaGPUManager{
|
||||
activePodsLister: podLister,
|
||||
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
|
||||
allocated: newPodGPUs(),
|
||||
}
|
||||
|
||||
// Expect that no devices are in use.
|
||||
gpusInUse, err := testGpuManager.gpusInUse()
|
||||
as := assert.New(t)
|
||||
as.Nil(err)
|
||||
as.Equal(len(gpusInUse.devices()), 0)
|
||||
|
||||
// Allocate GPUs for a pod with two containers.
|
||||
pod := makeTestPod(2)
|
||||
// Allocate for the first container.
|
||||
devices1, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices1), 1)
|
||||
|
||||
podLister.activePods = append(podLister.activePods, pod)
|
||||
// Allocate for the second container.
|
||||
devices2, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[1])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices2), 1)
|
||||
|
||||
as.NotEqual(devices1, devices2, "expected containers to get different devices")
|
||||
|
||||
// further allocations should fail.
|
||||
newPod := makeTestPod(2)
|
||||
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
|
||||
as.NotNil(err, "expected gpu allocation to fail. got: %v", devices1)
|
||||
|
||||
// Now terminate the original pod and observe that GPU allocation for new pod succeeds.
|
||||
podLister.activePods = podLister.activePods[:0]
|
||||
|
||||
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices1), 1)
|
||||
|
||||
podLister.activePods = append(podLister.activePods, newPod)
|
||||
|
||||
devices2, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[1])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices2), 1)
|
||||
|
||||
as.NotEqual(devices1, devices2, "expected containers to get different devices")
|
||||
}
|
||||
|
||||
func TestMultiPodGPUAllocation(t *testing.T) {
|
||||
podLister := &testActivePodsLister{}
|
||||
|
||||
testGpuManager := &nvidiaGPUManager{
|
||||
activePodsLister: podLister,
|
||||
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
|
||||
allocated: newPodGPUs(),
|
||||
}
|
||||
|
||||
// Expect that no devices are in use.
|
||||
gpusInUse, err := testGpuManager.gpusInUse()
|
||||
as := assert.New(t)
|
||||
as.Nil(err)
|
||||
as.Equal(len(gpusInUse.devices()), 0)
|
||||
|
||||
// Allocate a GPU for a pod with a single container.
|
||||
podA := makeTestPod(1)
|
||||
// Allocate for the first container.
|
||||
devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devicesA), 1)
|
||||
|
||||
podLister.activePods = append(podLister.activePods, podA)
|
||||
|
||||
// A second pod should be allocated the remaining GPU.
|
||||
podB := makeTestPod(1)
|
||||
// Allocate for the first container.
|
||||
devicesB, err := testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devicesB), 1)
|
||||
as.NotEqual(devicesA, devicesB, "expected pods to get different devices")
|
||||
}
|
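The PR checklist notes that GPU manager unit-test coverage is not yet complete; as an example of the kind of gap-filler that could follow, here is a hedged sketch of a direct test for the `podGPUs` helpers (not part of this commit):

```go
package nvidia

import (
	"testing"

	"github.com/stretchr/testify/assert"

	"k8s.io/apimachinery/pkg/util/sets"
)

// TestPodGPUsBookkeeping is NOT part of this commit; it sketches a direct test
// of the podGPUs helpers that back the manager's allocation logic.
func TestPodGPUsBookkeeping(t *testing.T) {
	as := assert.New(t)
	pg := newPodGPUs()
	pg.insert("pod-a", "/dev/nvidia0")
	pg.insert("pod-b", "/dev/nvidia1")

	as.Equal(sets.NewString("pod-a", "pod-b"), pg.pods())
	as.Equal(sets.NewString("/dev/nvidia0", "/dev/nvidia1"), pg.devices())

	pg.delete([]string{"pod-a"})
	as.Equal(sets.NewString("/dev/nvidia1"), pg.devices())
}
```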
pkg/kubelet/gpu/types.go (new file, 32 lines)
@@ -0,0 +1,32 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gpu
|
||||
|
||||
import "k8s.io/kubernetes/pkg/api/v1"
|
||||
|
||||
// GPUManager manages GPUs on a local node.
|
||||
// Implementations are expected to be thread safe.
|
||||
type GPUManager interface {
|
||||
// Start logically initializes GPUManager
|
||||
Start() error
|
||||
// Capacity returns the total number of GPUs on the node.
|
||||
Capacity() v1.ResourceList
|
||||
// AllocateGPU attempts to allocate GPUs for input container.
|
||||
// Returns paths to allocated GPUs and nil on success.
|
||||
// Returns an error on failure.
|
||||
AllocateGPU(*v1.Pod, *v1.Container) ([]string, error)
|
||||
}
|
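The interface is deliberately small; a hedged sketch of a trivial in-memory implementation (illustrative, e.g. for tests) to show the full contract:

```go
package gpu

import (
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/kubernetes/pkg/api/v1"
)

// fakeGPUManager is illustrative (not part of this commit): an in-memory
// GPUManager that advertises a fixed device list and hands it back verbatim.
type fakeGPUManager struct {
	devices []string
}

func (f *fakeGPUManager) Start() error { return nil }

func (f *fakeGPUManager) Capacity() v1.ResourceList {
	return v1.ResourceList{
		v1.ResourceNvidiaGPU: *resource.NewQuantity(int64(len(f.devices)), resource.DecimalSI),
	}
}

func (f *fakeGPUManager) AllocateGPU(_ *v1.Pod, _ *v1.Container) ([]string, error) {
	return f.devices, nil
}
```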
@ -67,6 +67,8 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockertools"
|
||||
"k8s.io/kubernetes/pkg/kubelet/events"
|
||||
"k8s.io/kubernetes/pkg/kubelet/eviction"
|
||||
"k8s.io/kubernetes/pkg/kubelet/gpu"
|
||||
"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
|
||||
"k8s.io/kubernetes/pkg/kubelet/images"
|
||||
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
@ -450,7 +452,6 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
|
||||
nonMasqueradeCIDR: kubeCfg.NonMasqueradeCIDR,
|
||||
maxPods: int(kubeCfg.MaxPods),
|
||||
podsPerCore: int(kubeCfg.PodsPerCore),
|
||||
nvidiaGPUs: int(kubeCfg.NvidiaGPUs),
|
||||
syncLoopMonitor: atomic.Value{},
|
||||
resolverConfig: kubeCfg.ResolverConfig,
|
||||
cpuCFSQuota: kubeCfg.CPUCFSQuota,
|
||||
@ -786,7 +787,16 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
|
||||
|
||||
klet.appArmorValidator = apparmor.NewValidator(kubeCfg.ContainerRuntime)
|
||||
klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator))
|
||||
|
||||
if utilfeature.DefaultFeatureGate.Enabled(features.Accelerators) {
|
||||
if kubeCfg.ContainerRuntime != "docker" {
|
||||
return nil, fmt.Errorf("Accelerators feature is supported with docker runtime only.")
|
||||
}
|
||||
if klet.gpuManager, err = nvidia.NewNvidiaGPUManager(klet, klet.dockerClient); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
klet.gpuManager = gpu.NewGPUManagerStub()
|
||||
}
|
||||
// Finally, put the most recent version of the config on the Kubelet, so
|
||||
// people can see how it was configured.
|
||||
klet.kubeletConfiguration = *kubeCfg
|
||||
@ -981,9 +991,6 @@ type Kubelet struct {
|
||||
// Maximum Number of Pods which can be run by this Kubelet
|
||||
maxPods int
|
||||
|
||||
// Number of NVIDIA GPUs on this node
|
||||
nvidiaGPUs int
|
||||
|
||||
// Monitor Kubelet's sync loop
|
||||
syncLoopMonitor atomic.Value
|
||||
|
||||
@ -1089,6 +1096,9 @@ type Kubelet struct {
|
||||
// This should only be enabled when the container runtime is performing user remapping AND if the
|
||||
// experimental behavior is desired.
|
||||
experimentalHostUserNamespaceDefaulting bool
|
||||
|
||||
// GPU Manager
|
||||
gpuManager gpu.GPUManager
|
||||
}
|
||||
|
||||
// setupDataDirs creates:
|
||||
@ -1182,7 +1192,10 @@ func (kl *Kubelet) initializeModules() error {
|
||||
return fmt.Errorf("Failed to start OOM watcher %v", err)
|
||||
}
|
||||
|
||||
// Step 7: Start resource analyzer
|
||||
// Step 7: Initialize GPUs
|
||||
kl.gpuManager.Start()
|
||||
|
||||
// Step 8: Start resource analyzer
|
||||
kl.resourceAnalyzer.Start()
|
||||
|
||||
return nil
|
||||
|
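Note that the `kl.gpuManager.Start()` call above drops the returned error. A hedged drop-in variant that logs the failure while still letting initialization proceed (illustrative, not what the commit does):

```go
// Illustrative drop-in variant of the call above (not what the commit does):
// log Start() failures instead of silently discarding them, while still
// letting kubelet module initialization proceed.
if err := kl.gpuManager.Start(); err != nil {
	glog.Errorf("Failed to start gpuManager: %v", err)
}
```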
@ -482,6 +482,14 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
|
||||
node.Status.Capacity = v1.ResourceList{}
|
||||
}
|
||||
|
||||
// populate GPU capacity.
|
||||
gpuCapacity := kl.gpuManager.Capacity()
|
||||
if gpuCapacity != nil {
|
||||
for k, v := range gpuCapacity {
|
||||
node.Status.Capacity[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start
|
||||
// cAdvisor locally, e.g. for test-cmd.sh, and in integration test.
|
||||
info, err := kl.GetCachedMachineInfo()
|
||||
@ -491,8 +499,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
|
||||
node.Status.Capacity[v1.ResourceCPU] = *resource.NewMilliQuantity(0, resource.DecimalSI)
|
||||
node.Status.Capacity[v1.ResourceMemory] = resource.MustParse("0Gi")
|
||||
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(int64(kl.maxPods), resource.DecimalSI)
|
||||
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(int64(kl.nvidiaGPUs), resource.DecimalSI)
|
||||
|
||||
glog.Errorf("Error getting machine info: %v", err)
|
||||
} else {
|
||||
node.Status.NodeInfo.MachineID = info.MachineID
|
||||
@ -509,8 +515,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
|
||||
node.Status.Capacity[v1.ResourcePods] = *resource.NewQuantity(
|
||||
int64(kl.maxPods), resource.DecimalSI)
|
||||
}
|
||||
node.Status.Capacity[v1.ResourceNvidiaGPU] = *resource.NewQuantity(
|
||||
int64(kl.nvidiaGPUs), resource.DecimalSI)
|
||||
if node.Status.NodeInfo.BootID != "" &&
|
||||
node.Status.NodeInfo.BootID != info.BootID {
|
||||
// TODO: This requires a transaction, either both node status is updated
|
||||
|
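With the block above, GPU capacity on the node object is fed by the manager rather than a static flag; a hedged sketch (illustrative function name) of reading it back, the way the e2e test below does:

```go
package example

import "k8s.io/kubernetes/pkg/api/v1"

// advertisedGPUs is an illustrative helper: once the kubelet merges
// gpuManager.Capacity() into node.Status.Capacity, consumers can read the
// GPU count straight off the Node object.
func advertisedGPUs(node *v1.Node) int64 {
	return node.Status.Capacity.NvidiaGPU().Value()
}
```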
@ -208,16 +208,14 @@ func TestUpdateNewNodeStatus(t *testing.T) {
|
||||
KubeProxyVersion: version.Get().String(),
|
||||
},
|
||||
Capacity: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Allocatable: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Addresses: []v1.NodeAddress{
|
||||
{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},
|
||||
@ -402,10 +400,9 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Allocatable: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
},
|
||||
}
|
||||
@ -482,16 +479,14 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
|
||||
KubeProxyVersion: version.Get().String(),
|
||||
},
|
||||
Capacity: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Allocatable: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Addresses: []v1.NodeAddress{
|
||||
{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},
|
||||
@ -790,16 +785,14 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) {
|
||||
KubeProxyVersion: version.Get().String(),
|
||||
},
|
||||
Capacity: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Allocatable: v1.ResourceList{
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
v1.ResourceCPU: *resource.NewMilliQuantity(1800, resource.DecimalSI),
|
||||
v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
|
||||
v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI),
|
||||
},
|
||||
Addresses: []v1.NodeAddress{
|
||||
{Type: v1.NodeLegacyHostIP, Address: "127.0.0.1"},
|
||||
|
@ -84,20 +84,23 @@ func (kl *Kubelet) getActivePods() []*v1.Pod {
|
||||
}
|
||||
|
||||
// makeDevices determines the devices for the given container.
|
||||
// Experimental. For now, we hardcode /dev/nvidia0 no matter what the user asks for
|
||||
// (we only support one device per node).
|
||||
// TODO: add support for more than 1 GPU after #28216.
|
||||
func makeDevices(container *v1.Container) []kubecontainer.DeviceInfo {
|
||||
nvidiaGPULimit := container.Resources.Limits.NvidiaGPU()
|
||||
if nvidiaGPULimit.Value() != 0 {
|
||||
return []kubecontainer.DeviceInfo{
|
||||
{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
|
||||
{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
|
||||
{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
|
||||
}
|
||||
// Experimental.
|
||||
func (kl *Kubelet) makeDevices(pod *v1.Pod, container *v1.Container) ([]kubecontainer.DeviceInfo, error) {
|
||||
if container.Resources.Limits.NvidiaGPU().IsZero() {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return nil
|
||||
nvidiaGPUPaths, err := kl.gpuManager.AllocateGPU(pod, container)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var devices []kubecontainer.DeviceInfo
|
||||
for _, path := range nvidiaGPUPaths {
|
||||
// Devices have to be mapped one to one because of nvidia CUDA library requirements.
|
||||
devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: path, Permissions: "mrw"})
|
||||
}
|
||||
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
// makeMounts determines the mount points for the given container.
|
||||
@ -285,7 +288,10 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Contai
|
||||
|
||||
opts.PortMappings = kubecontainer.MakePortMappings(container)
|
||||
// TODO(random-liu): Move following convert functions into pkg/kubelet/container
|
||||
opts.Devices = makeDevices(container)
|
||||
opts.Devices, err = kl.makeDevices(pod, container)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
opts.Mounts, err = makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIP, volumes)
|
||||
if err != nil {
|
||||
|
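To make the new mapping concrete: for a container whose limits request two GPUs on a node where /dev/nvidia0 and /dev/nvidia1 are free, makeDevices would produce a device list shaped like the following. This is an illustration of the expected shape, not captured output:

```go
// Illustrative shape of the result only (not captured output): two allocated
// card devices plus the default control devices appended by the Nvidia
// manager, each mapped 1:1 into the container with "mrw" permissions.
devices := []kubecontainer.DeviceInfo{
	{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
	{PathOnHost: "/dev/nvidia1", PathInContainer: "/dev/nvidia1", Permissions: "mrw"},
	{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
	{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
}
_ = devices
```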
@ -27,7 +27,6 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
@ -1711,39 +1710,6 @@ func TestGetHostPortConflicts(t *testing.T) {
|
||||
assert.True(t, hasHostPortConflicts(pods), "Should have port conflicts")
|
||||
}
|
||||
|
||||
func TestMakeDevices(t *testing.T) {
|
||||
testCases := []struct {
|
||||
container *v1.Container
|
||||
devices []kubecontainer.DeviceInfo
|
||||
test string
|
||||
}{
|
||||
{
|
||||
test: "no device",
|
||||
container: &v1.Container{},
|
||||
devices: nil,
|
||||
},
|
||||
{
|
||||
test: "gpu",
|
||||
container: &v1.Container{
|
||||
Resources: v1.ResourceRequirements{
|
||||
Limits: map[v1.ResourceName]resource.Quantity{
|
||||
v1.ResourceNvidiaGPU: resource.MustParse("1000"),
|
||||
},
|
||||
},
|
||||
},
|
||||
devices: []kubecontainer.DeviceInfo{
|
||||
{PathOnHost: "/dev/nvidia0", PathInContainer: "/dev/nvidia0", Permissions: "mrw"},
|
||||
{PathOnHost: "/dev/nvidiactl", PathInContainer: "/dev/nvidiactl", Permissions: "mrw"},
|
||||
{PathOnHost: "/dev/nvidia-uvm", PathInContainer: "/dev/nvidia-uvm", Permissions: "mrw"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range testCases {
|
||||
assert.Equal(t, test.devices, makeDevices(test.container), "[test %q]", test.test)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasHostMountPVC(t *testing.T) {
|
||||
tests := map[string]struct {
|
||||
pvError error
|
||||
|
@ -49,6 +49,7 @@ import (
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
|
||||
"k8s.io/kubernetes/pkg/kubelet/eviction"
|
||||
"k8s.io/kubernetes/pkg/kubelet/gpu"
|
||||
"k8s.io/kubernetes/pkg/kubelet/images"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/network"
|
||||
@ -272,7 +273,7 @@ func newTestKubeletWithImageList(
|
||||
|
||||
kubelet.AddPodSyncLoopHandler(activeDeadlineHandler)
|
||||
kubelet.AddPodSyncHandler(activeDeadlineHandler)
|
||||
|
||||
kubelet.gpuManager = gpu.NewGPUManagerStub()
|
||||
return &TestKubelet{kubelet, fakeRuntime, mockCadvisor, fakeKubeClient, fakeMirrorClient, fakeClock, nil, plug}
|
||||
}
|
||||
|
||||
|
@ -150,7 +150,6 @@ func GetHollowKubeletConfig(
|
||||
c.MaxContainerCount = 100
|
||||
c.MaxOpenFiles = 1024
|
||||
c.MaxPerPodContainerCount = 2
|
||||
c.NvidiaGPUs = 0
|
||||
c.RegisterNode = true
|
||||
c.RegisterSchedulable = true
|
||||
c.RegistryBurst = 10
|
||||
|
@ -224,7 +224,7 @@ func (c *PodClient) WaitForErrorEventOrSuccess(pod *v1.Pod) (*v1.Event, error) {
|
||||
return ev, err
|
||||
}
|
||||
|
||||
// MatchContainerOutput gest output of a container and match expected regexp in the output.
|
||||
// MatchContainerOutput gets output of a container and match expected regexp in the output.
|
||||
func (c *PodClient) MatchContainerOutput(name string, containerName string, expectedRegexp string) error {
|
||||
f := c.f
|
||||
output, err := GetPodLogs(f.ClientSet, f.Namespace.Name, name, containerName)
|
||||
|
@ -14,6 +14,7 @@ go_library(
|
||||
"benchmark_util.go",
|
||||
"container.go",
|
||||
"doc.go",
|
||||
"gpus.go",
|
||||
"image_list.go",
|
||||
"resource_collector.go",
|
||||
"simple_mount.go",
|
||||
@ -37,12 +38,14 @@ go_library(
|
||||
"//vendor:github.com/onsi/gomega",
|
||||
"//vendor:github.com/opencontainers/runc/libcontainer/cgroups",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/errors",
|
||||
"//vendor:k8s.io/apimachinery/pkg/api/resource",
|
||||
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
|
||||
"//vendor:k8s.io/apimachinery/pkg/labels",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/runtime",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/uuid",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/wait",
|
||||
"//vendor:k8s.io/client-go/pkg/api",
|
||||
],
|
||||
)
|
||||
|
||||
|
test/e2e_node/gpus.go (new file, 135 lines)
@@ -0,0 +1,135 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e_node
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/pkg/api"
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
"k8s.io/kubernetes/pkg/apis/componentconfig"
|
||||
"k8s.io/kubernetes/test/e2e/framework"
|
||||
|
||||
. "github.com/onsi/ginkgo"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
const acceleratorsFeatureGate = "Accelerators=true"
|
||||
|
||||
// Serial because the test updates kubelet configuration.
|
||||
var _ = framework.KubeDescribe("GPU [Serial]", func() {
|
||||
f := framework.NewDefaultFramework("gpu-test")
|
||||
Context("attempt to use GPUs if available", func() {
|
||||
It("setup the node and create pods to test gpus", func() {
|
||||
By("ensuring that dynamic kubelet configuration is enabled")
|
||||
enabled, err := isKubeletConfigEnabled(f)
|
||||
framework.ExpectNoError(err)
|
||||
if !enabled {
|
||||
Skip("Dynamic Kubelet configuration is not enabled. Skipping test.")
|
||||
}
|
||||
|
||||
By("enabling support for GPUs")
|
||||
var oldCfg *componentconfig.KubeletConfiguration
|
||||
defer func() {
|
||||
if oldCfg != nil {
|
||||
framework.ExpectNoError(setKubeletConfiguration(f, oldCfg))
|
||||
}
|
||||
}()
|
||||
|
||||
oldCfg, err = getCurrentKubeletConfig()
|
||||
framework.ExpectNoError(err)
|
||||
clone, err := api.Scheme.DeepCopy(oldCfg)
|
||||
framework.ExpectNoError(err)
|
||||
newCfg := clone.(*componentconfig.KubeletConfiguration)
|
||||
if newCfg.FeatureGates != "" {
|
||||
newCfg.FeatureGates = fmt.Sprintf("%s,%s", acceleratorsFeatureGate, newCfg.FeatureGates)
|
||||
} else {
|
||||
newCfg.FeatureGates = acceleratorsFeatureGate
|
||||
}
|
||||
framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
|
||||
|
||||
By("Getting the local node object from the api server")
|
||||
nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
|
||||
framework.ExpectNoError(err, "getting node list")
|
||||
Expect(len(nodeList.Items)).To(Equal(1))
|
||||
node := nodeList.Items[0]
|
||||
gpusAvailable := node.Status.Capacity.NvidiaGPU()
|
||||
By("Skipping the test if GPUs aren't available")
|
||||
if gpusAvailable.IsZero() {
|
||||
Skip("No GPUs available on local node. Skipping test.")
|
||||
}
|
||||
|
||||
By("Creating a pod that will consume all GPUs")
|
||||
podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
|
||||
podSuccess = f.PodClient().CreateSync(podSuccess)
|
||||
|
||||
By("Checking if the pod outputted Success to its logs")
|
||||
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))
|
||||
|
||||
By("Creating a new pod requesting a GPU and noticing that it is rejected by the Kubelet")
|
||||
podFailure := makePod(1, "gpu-failure")
|
||||
framework.WaitForPodCondition(f.ClientSet, f.Namespace.Name, podFailure.Name, "pod rejected", framework.PodStartTimeout, func(pod *v1.Pod) (bool, error) {
|
||||
if pod.Status.Phase == v1.PodFailed {
|
||||
return true, nil
|
||||
|
||||
}
|
||||
return false, nil
|
||||
})
|
||||
|
||||
By("stopping the original Pod with GPUs")
|
||||
gp := int64(0)
|
||||
deleteOptions := metav1.DeleteOptions{
|
||||
GracePeriodSeconds: &gp,
|
||||
}
|
||||
f.PodClient().DeleteSync(podSuccess.Name, &deleteOptions, 30*time.Second)
|
||||
|
||||
By("attempting to start the failed pod again")
|
||||
f.PodClient().DeleteSync(podFailure.Name, &deleteOptions, 10*time.Second)
|
||||
podFailure = f.PodClient().CreateSync(podFailure)
|
||||
|
||||
By("Checking if the pod outputted Success to its logs")
|
||||
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podFailure.Name, podFailure.Name, "Success"))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
func makePod(gpus int64, name string) *v1.Pod {
|
||||
resources := v1.ResourceRequirements{
|
||||
Limits: v1.ResourceList{
|
||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
|
||||
},
|
||||
}
|
||||
gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$') ]]; then exit 1; fi; echo Success; sleep 10240 ", gpus)
|
||||
return &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: name,
|
||||
},
|
||||
Spec: v1.PodSpec{
|
||||
Containers: []v1.Container{
|
||||
{
|
||||
Image: "gcr.io/google_containers/busybox:1.24",
|
||||
Name: name,
|
||||
Command: []string{"sh", "-c", gpuverificationCmd},
|
||||
Resources: resources,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|