use active pods instead of runtime pods in gpu manager

Signed-off-by: Vishnu kannan <vishnuk@google.com>
Vishnu kannan 2017-03-11 10:43:24 -08:00
parent 8ed9bff073
commit ff158090b3
4 changed files with 13 additions and 22 deletions

View File

@@ -48,7 +48,7 @@ const (
 type activePodsLister interface {
     // Returns a list of active pods on the node.
-    GetRunningPods() ([]*v1.Pod, error)
+    GetActivePods() []*v1.Pod
 }

 // nvidiaGPUManager manages nvidia gpu devices.
@@ -148,9 +148,7 @@ func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) (
         ngm.allocated = allocated
     } else {
         // update internal list of GPUs in use prior to allocating new GPUs.
-        if err := ngm.updateAllocatedGPUs(); err != nil {
-            return nil, fmt.Errorf("Failed to allocate GPUs because of issues with updating GPUs in use: %v", err)
-        }
+        ngm.updateAllocatedGPUs()
     }
     // Check if GPUs have already been allocated. If so return them right away.
     // This can happen if a container restarts for example.
@@ -179,13 +177,10 @@ func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) (
 }

 // updateAllocatedGPUs updates the list of GPUs in use.
-// It gets a list of running pods and then frees any GPUs that are bound to terminated pods.
+// It gets a list of active pods and then frees any GPUs that are bound to terminated pods.
 // Returns error on failure.
-func (ngm *nvidiaGPUManager) updateAllocatedGPUs() error {
-    activePods, err := ngm.activePodsLister.GetRunningPods()
-    if err != nil {
-        return fmt.Errorf("Failed to list active pods: %v", err)
-    }
+func (ngm *nvidiaGPUManager) updateAllocatedGPUs() {
+    activePods := ngm.activePodsLister.GetActivePods()
     activePodUids := sets.NewString()
     for _, pod := range activePods {
         activePodUids.Insert(string(pod.UID))
@@ -194,7 +189,6 @@ func (ngm *nvidiaGPUManager) updateAllocatedGPUs() error {
     podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
     glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
     ngm.allocated.delete(podsToBeRemoved.List())
-    return nil
 }

 // discoverGPUs identifies allGPUs NVIDIA GPU devices available on the local node by walking `/dev` directory.
@@ -224,10 +218,7 @@ func (ngm *nvidiaGPUManager) discoverGPUs() error {
 // gpusInUse returns a list of GPUs in use along with the respective pods that are using it.
 func (ngm *nvidiaGPUManager) gpusInUse() (*podGPUs, error) {
-    pods, err := ngm.activePodsLister.GetRunningPods()
-    if err != nil {
-        return nil, err
-    }
+    pods := ngm.activePodsLister.GetActivePods()
     type containerIdentifier struct {
         id   string
         name string
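
The upshot of this file's changes is that the GPU manager's pod source can no longer fail, so the error plumbing around updateAllocatedGPUs and gpusInUse disappears, including the "Failed to allocate GPUs because of issues with updating GPUs in use" branch in AllocateGPU. The following is a self-contained sketch of the new lister contract and the set-difference reconciliation it feeds; Pod, fakeLister, and reconcile are simplified stand-ins for illustration, not code from this commit.

package main

import "fmt"

// Pod is a simplified stand-in for *v1.Pod; only the UID matters here.
type Pod struct{ UID string }

// activePodsLister mirrors the new interface: the kubelet's pod manager is an
// in-memory cache, so listing active pods cannot fail and returns no error.
type activePodsLister interface {
    GetActivePods() []Pod
}

// fakeLister is a trivial implementation, similar in spirit to the test helper
// in the GPU manager's test file.
type fakeLister struct{ pods []Pod }

func (f fakeLister) GetActivePods() []Pod { return f.pods }

// reconcile drops GPU bookkeeping for pods that are no longer active, the same
// set-difference idea updateAllocatedGPUs applies to its allocated cache.
// allocated maps pod UID -> device paths.
func reconcile(lister activePodsLister, allocated map[string][]string) {
    active := map[string]bool{}
    for _, p := range lister.GetActivePods() {
        active[p.UID] = true
    }
    for uid := range allocated {
        if !active[uid] {
            delete(allocated, uid) // terminated pod: its GPUs become reusable
        }
    }
}

func main() {
    allocated := map[string][]string{
        "pod-a": {"/dev/nvidia0"},
        "pod-b": {"/dev/nvidia1"},
    }
    reconcile(fakeLister{pods: []Pod{{UID: "pod-a"}}}, allocated)
    fmt.Println(allocated) // only pod-a's allocation remains
}

Because reconcile never has to propagate an error, callers such as AllocateGPU can simply invoke it before handing out new devices.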

View File

@@ -32,8 +32,8 @@ type testActivePodsLister struct {
     activePods []*v1.Pod
 }

-func (tapl *testActivePodsLister) GetRunningPods() ([]*v1.Pod, error) {
-    return tapl.activePods, nil
+func (tapl *testActivePodsLister) GetActivePods() []*v1.Pod {
+    return tapl.activePods
 }

 func makeTestPod(numContainers, gpusPerContainer int) *v1.Pod {

View File

@@ -792,7 +792,7 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
     klet.AddPodSyncLoopHandler(activeDeadlineHandler)
     klet.AddPodSyncHandler(activeDeadlineHandler)
-    criticalPodAdmissionHandler := preemption.NewCriticalPodAdmissionHandler(klet.getActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeDeps.Recorder)
+    criticalPodAdmissionHandler := preemption.NewCriticalPodAdmissionHandler(klet.GetActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeDeps.Recorder)
     klet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(klet.getNodeAnyWay, criticalPodAdmissionHandler))
     // apply functional Option's
     for _, opt := range kubeDeps.Options {
@@ -1204,7 +1204,7 @@ func (kl *Kubelet) initializeModules() error {
         return fmt.Errorf("Kubelet failed to get node info: %v", err)
     }

-    if err := kl.containerManager.Start(node, kl.getActivePods); err != nil {
+    if err := kl.containerManager.Start(node, kl.GetActivePods); err != nil {
         return fmt.Errorf("Failed to start ContainerManager %v", err)
     }
@@ -1230,7 +1230,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
         glog.Fatalf("Failed to start cAdvisor %v", err)
     }
     // eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
-    kl.evictionManager.Start(kl, kl.getActivePods, kl, evictionMonitoringPeriod)
+    kl.evictionManager.Start(kl, kl.GetActivePods, kl, evictionMonitoringPeriod)
 }

 // Run starts the kubelet reacting to config updates
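
These call sites already pass kl.GetActivePods around as a plain function value, which works with either capitalization; the rename likely matters because the GPU manager's activePodsLister interface is declared in a different package, and in Go a type can only satisfy a foreign interface through exported method names. A self-contained sketch of that constraint follows; the types are simplified stand-ins, not code from this commit.

package main

import "fmt"

// Pod is a simplified stand-in for *v1.Pod.
type Pod struct{ Name string }

// activePodsLister mirrors the GPU manager's interface. Because that interface
// lives outside the kubelet package, the method it names must be exported for
// *Kubelet to satisfy it - hence getActivePods -> GetActivePods.
type activePodsLister interface {
    GetActivePods() []Pod
}

type Kubelet struct{ pods []Pod }

// GetActivePods reports the pods the kubelet currently considers non-terminal.
func (kl *Kubelet) GetActivePods() []Pod { return kl.pods }

// gpuManager stands in for the nvidia GPU manager: it holds a lister and asks
// it for active pods whenever it reconciles allocations.
type gpuManager struct{ lister activePodsLister }

func (gm *gpuManager) activePodCount() int { return len(gm.lister.GetActivePods()) }

func main() {
    kl := &Kubelet{pods: []Pod{{Name: "nginx"}, {Name: "gpu-job"}}}
    gm := &gpuManager{lister: kl} // *Kubelet satisfies activePodsLister directly
    fmt.Println("active pods:", gm.activePodCount())
}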

View File

@@ -76,8 +76,8 @@ func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) {
     return pods, nil
 }

-// getActivePods returns non-terminal pods
-func (kl *Kubelet) getActivePods() []*v1.Pod {
+// GetActivePods returns non-terminal pods
+func (kl *Kubelet) GetActivePods() []*v1.Pod {
     allPods := kl.podManager.GetPods()
     activePods := kl.filterOutTerminatedPods(allPods)
     return activePods
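
GetActivePods itself just narrows the pod manager's full list to non-terminal pods via filterOutTerminatedPods. Below is a loose, simplified sketch of that filtering idea; the real helper also considers deletion state, while this version keys off phase only and uses stand-in types.

package main

import "fmt"

// PodPhase and Pod are simplified stand-ins for the v1 API types.
type PodPhase string

const (
    PodRunning   PodPhase = "Running"
    PodSucceeded PodPhase = "Succeeded"
    PodFailed    PodPhase = "Failed"
)

type Pod struct {
    Name  string
    Phase PodPhase
}

// filterOutTerminatedPods keeps only pods that have not reached a terminal
// phase; terminated pods no longer hold node resources such as GPUs.
func filterOutTerminatedPods(pods []Pod) []Pod {
    var active []Pod
    for _, p := range pods {
        if p.Phase == PodSucceeded || p.Phase == PodFailed {
            continue
        }
        active = append(active, p)
    }
    return active
}

func main() {
    all := []Pod{
        {Name: "web", Phase: PodRunning},
        {Name: "batch", Phase: PodSucceeded},
    }
    fmt.Println(filterOutTerminatedPods(all)) // only "web" remains
}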