diff --git a/pkg/kubelet/cm/BUILD b/pkg/kubelet/cm/BUILD
index dddc64123cd..42cadd854b3 100644
--- a/pkg/kubelet/cm/BUILD
+++ b/pkg/kubelet/cm/BUILD
@@ -19,6 +19,7 @@ go_library(
         "node_container_manager.go",
         "pod_container_manager_linux.go",
         "pod_container_manager_stub.go",
+        "qos_container_manager_linux.go",
         "types.go",
     ],
     tags = ["automanaged"],
diff --git a/pkg/kubelet/cm/container_manager.go b/pkg/kubelet/cm/container_manager.go
index 75228bf3f0e..b2c6f8fc4e8 100644
--- a/pkg/kubelet/cm/container_manager.go
+++ b/pkg/kubelet/cm/container_manager.go
@@ -23,12 +23,14 @@ import (
 	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 )
 
+type ActivePodsFunc func() []*v1.Pod
+
 // Manages the containers running on a machine.
 type ContainerManager interface {
 	// Runs the container manager's housekeeping.
 	// - Ensures that the Docker daemon is in a container.
 	// - Creates the system container where all non-containerized processes run.
-	Start(*v1.Node) error
+	Start(*v1.Node, ActivePodsFunc) error
 
 	// Returns resources allocated to system cgroups in the machine.
 	// These cgroups include the system and Kubernetes services.
@@ -52,6 +54,10 @@ type ContainerManager interface {
 
 	// GetNodeAllocatable returns the amount of compute resources that have to be reserved from scheduling.
 	GetNodeAllocatableReservation() v1.ResourceList
+
+	// UpdateQOSCgroups performs housekeeping updates to ensure that the top
+	// level QoS containers have their desired state in a thread-safe way
+	UpdateQOSCgroups() error
 }
 
 type NodeConfig struct {
diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go
index 4cd4d9b45a9..d613910d48d 100644
--- a/pkg/kubelet/cm/container_manager_linux.go
+++ b/pkg/kubelet/cm/container_manager_linux.go
@@ -116,6 +116,8 @@ type containerManagerImpl struct {
 	cgroupRoot string
 	// Event recorder interface.
 	recorder record.EventRecorder
+	// Interface for QoS cgroup management
+	qosContainerManager QOSContainerManager
 }
 
 type features struct {
@@ -246,15 +248,22 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
 		cgroupRoot = path.Join(cgroupRoot, defaultNodeAllocatableCgroupName)
 	}
 	glog.Infof("Creating Container Manager object based on Node Config: %+v", nodeConfig)
+
+	qosContainerManager, err := NewQOSContainerManager(subsystems, cgroupRoot, nodeConfig)
+	if err != nil {
+		return nil, err
+	}
+
 	return &containerManagerImpl{
-		cadvisorInterface: cadvisorInterface,
-		mountUtil:         mountUtil,
-		NodeConfig:        nodeConfig,
-		subsystems:        subsystems,
-		cgroupManager:     cgroupManager,
-		capacity:          capacity,
-		cgroupRoot:        cgroupRoot,
-		recorder:          recorder,
+		cadvisorInterface:   cadvisorInterface,
+		mountUtil:           mountUtil,
+		NodeConfig:          nodeConfig,
+		subsystems:          subsystems,
+		cgroupManager:       cgroupManager,
+		capacity:            capacity,
+		cgroupRoot:          cgroupRoot,
+		recorder:            recorder,
+		qosContainerManager: qosContainerManager,
 	}, nil
 }
 
@@ -264,7 +273,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
 func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
 	if cm.NodeConfig.CgroupsPerQOS {
 		return &podContainerManagerImpl{
-			qosContainersInfo: cm.qosContainers,
+			qosContainersInfo: cm.GetQOSContainersInfo(),
 			nodeInfo:          cm.nodeInfo,
 			subsystems:        cm.subsystems,
 			cgroupManager:     cm.cgroupManager,
@@ -297,53 +306,6 @@ const (
 	KernelTunableModify KernelTunableBehavior = "modify"
 )
 
-// InitQOS creates the top level qos cgroup containers
-// We create top level QoS containers for only Burstable and Best Effort
-// and not Guaranteed QoS class. All guaranteed pods are nested under the
-// RootContainer by default. InitQOS is called only once during kubelet bootstrapping.
-func InitQOS(cgroupDriver, rootContainer string, subsystems *CgroupSubsystems) (QOSContainersInfo, error) {
-	cm := NewCgroupManager(subsystems, cgroupDriver)
-	// Top level for Qos containers are created only for Burstable
-	// and Best Effort classes
-	qosClasses := [2]v1.PodQOSClass{v1.PodQOSBurstable, v1.PodQOSBestEffort}
-
-	// Create containers for both qos classes
-	for _, qosClass := range qosClasses {
-		// get the container's absolute name
-		absoluteContainerName := CgroupName(path.Join(rootContainer, string(qosClass)))
-
-		resourceParameters := &ResourceConfig{}
-		// the BestEffort QoS class has a statically configured minShares value
-		if qosClass == v1.PodQOSBestEffort {
-			minShares := int64(MinShares)
-			resourceParameters.CpuShares = &minShares
-		}
-		// containerConfig object stores the cgroup specifications
-		containerConfig := &CgroupConfig{
-			Name:               absoluteContainerName,
-			ResourceParameters: resourceParameters,
-		}
-		// check if it exists
-		if !cm.Exists(absoluteContainerName) {
-			if err := cm.Create(containerConfig); err != nil {
-				return QOSContainersInfo{}, fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
-			}
-		} else {
-			// to ensure we actually have the right state, we update the config on startup
-			if err := cm.Update(containerConfig); err != nil {
-				return QOSContainersInfo{}, fmt.Errorf("failed to update top level %v QOS cgroup : %v", qosClass, err)
-			}
-		}
-	}
-	// Store the top level qos container names
-	qosContainersInfo := QOSContainersInfo{
-		Guaranteed: rootContainer,
-		Burstable:  path.Join(rootContainer, string(v1.PodQOSBurstable)),
-		BestEffort: path.Join(rootContainer, string(v1.PodQOSBestEffort)),
-	}
-	return qosContainersInfo, nil
-}
-
 // setupKernelTunables validates kernel tunable flags are set as expected
 // depending upon the specified option, it will either warn, error, or modify the kernel tunable flags
 func setupKernelTunables(option KernelTunableBehavior) error {
@@ -383,7 +345,7 @@ func setupKernelTunables(option KernelTunableBehavior) error {
 	return utilerrors.NewAggregate(errList)
 }
 
-func (cm *containerManagerImpl) setupNode() error {
+func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
 	f, err := validateSystemRequirements(cm.mountUtil)
 	if err != nil {
 		return err
@@ -404,11 +366,10 @@ func (cm *containerManagerImpl) setupNode() error {
 		if err := cm.createNodeAllocatableCgroups(); err != nil {
 			return err
 		}
-		qosContainersInfo, err := InitQOS(cm.NodeConfig.CgroupDriver, cm.cgroupRoot, cm.subsystems)
+		err = cm.qosContainerManager.Start(cm.nodeInfo, activePods)
 		if err != nil {
 			return fmt.Errorf("failed to initialise top level QOS containers: %v", err)
 		}
-		cm.qosContainers = qosContainersInfo
 	}
 
 	// Enforce Node Allocatable (if required)
@@ -559,7 +520,11 @@ func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
 }
 
 func (cm *containerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
-	return cm.qosContainers
+	return cm.qosContainerManager.GetQOSContainersInfo()
+}
+
+func (cm *containerManagerImpl) UpdateQOSCgroups() error {
+	return cm.qosContainerManager.UpdateCgroups()
 }
 
 func (cm *containerManagerImpl) Status() Status {
@@ -568,12 +533,12 @@ func (cm *containerManagerImpl) Status() Status {
 	return cm.status
 }
 
-func (cm *containerManagerImpl) Start(node *v1.Node) error {
+func (cm *containerManagerImpl) Start(node *v1.Node, activePods ActivePodsFunc) error {
 	// cache the node Info including resource capacity and
 	// allocatable of the node
 	cm.nodeInfo = node
 	// Setup the node
-	if err := cm.setupNode(); err != nil {
+	if err := cm.setupNode(activePods); err != nil {
 		return err
 	}
 	// Ensure that node allocatable configuration is valid.
diff --git a/pkg/kubelet/cm/container_manager_stub.go b/pkg/kubelet/cm/container_manager_stub.go
index fb38b217dd9..1ca59ced263 100644
--- a/pkg/kubelet/cm/container_manager_stub.go
+++ b/pkg/kubelet/cm/container_manager_stub.go
@@ -25,7 +25,7 @@ type containerManagerStub struct{}
 
 var _ ContainerManager = &containerManagerStub{}
 
-func (cm *containerManagerStub) Start(_ *v1.Node) error {
+func (cm *containerManagerStub) Start(_ *v1.Node, _ ActivePodsFunc) error {
 	glog.V(2).Infof("Starting stub container manager")
 	return nil
 }
@@ -46,6 +46,10 @@ func (cm *containerManagerStub) GetQOSContainersInfo() QOSContainersInfo {
 	return QOSContainersInfo{}
 }
 
+func (cm *containerManagerStub) UpdateQOSCgroups() error {
+	return nil
+}
+
 func (cm *containerManagerStub) Status() Status {
 	return Status{}
 }
diff --git a/pkg/kubelet/cm/container_manager_unsupported.go b/pkg/kubelet/cm/container_manager_unsupported.go
index b8dcfb0d31a..66e20a9dc07 100644
--- a/pkg/kubelet/cm/container_manager_unsupported.go
+++ b/pkg/kubelet/cm/container_manager_unsupported.go
@@ -32,7 +32,7 @@ type unsupportedContainerManager struct {
 
 var _ ContainerManager = &unsupportedContainerManager{}
 
-func (unsupportedContainerManager) Start(_ *v1.Node) error {
+func (unsupportedContainerManager) Start(_ *v1.Node, _ ActivePodsFunc) error {
 	return fmt.Errorf("Container Manager is unsupported in this build")
 }
 
@@ -52,6 +52,10 @@ func (unsupportedContainerManager) GetQOSContainersInfo() QOSContainersInfo {
 	return QOSContainersInfo{}
 }
 
+func (unsupportedContainerManager) UpdateQOSCgroups() error {
+	return nil
+}
+
 func (cm *unsupportedContainerManager) Status() Status {
 	return Status{}
 }
diff --git a/pkg/kubelet/cm/container_manager_windows.go b/pkg/kubelet/cm/container_manager_windows.go
index 8a90fc2de9e..25949894b45 100644
--- a/pkg/kubelet/cm/container_manager_windows.go
+++ b/pkg/kubelet/cm/container_manager_windows.go
@@ -33,7 +33,7 @@ type containerManagerImpl struct {
 
 var _ ContainerManager = &containerManagerImpl{}
 
-func (cm *containerManagerImpl) Start(_ *v1.Node) error {
+func (cm *containerManagerImpl) Start(_ *v1.Node, _ ActivePodsFunc) error {
 	glog.V(2).Infof("Starting Windows stub container manager")
 	return nil
 }
diff --git a/pkg/kubelet/cm/qos_container_manager_linux.go b/pkg/kubelet/cm/qos_container_manager_linux.go
new file mode 100644
index 00000000000..621fac15f6a
--- /dev/null
+++ b/pkg/kubelet/cm/qos_container_manager_linux.go
@@ -0,0 +1,137 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package cm
+
+import (
+	"fmt"
+	"path"
+	"sync"
+
+	"k8s.io/kubernetes/pkg/api/v1"
+)
+
+type QOSContainerManager interface {
+	Start(*v1.Node, ActivePodsFunc) error
+	GetQOSContainersInfo() QOSContainersInfo
+	UpdateCgroups() error
+}
+
+type qosContainerManagerImpl struct {
+	sync.Mutex
+	nodeInfo          *v1.Node
+	qosContainersInfo QOSContainersInfo
+	subsystems        *CgroupSubsystems
+	cgroupManager     CgroupManager
+	activePods        ActivePodsFunc
+	cgroupRoot        string
+}
+
+func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nodeConfig NodeConfig) (QOSContainerManager, error) {
+	if !nodeConfig.CgroupsPerQOS {
+		return &qosContainerManagerNoop{
+			cgroupRoot: CgroupName(nodeConfig.CgroupRoot),
+		}, nil
+	}
+
+	return &qosContainerManagerImpl{
+		subsystems:    subsystems,
+		cgroupManager: NewCgroupManager(subsystems, nodeConfig.CgroupDriver),
+		cgroupRoot:    cgroupRoot,
+	}, nil
+}
+
+func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
+	return m.qosContainersInfo
+}
+
+func (m *qosContainerManagerImpl) Start(nodeInfo *v1.Node, activePods ActivePodsFunc) error {
+	cm := m.cgroupManager
+	rootContainer := m.cgroupRoot
+	if !cm.Exists(CgroupName(rootContainer)) {
+		return fmt.Errorf("root container %s doesn't exist", rootContainer)
+	}
+
+	// Top level for Qos containers are created only for Burstable
+	// and Best Effort classes
+	qosClasses := [2]v1.PodQOSClass{v1.PodQOSBurstable, v1.PodQOSBestEffort}
+
+	// Create containers for both qos classes
+	for _, qosClass := range qosClasses {
+		// get the container's absolute name
+		absoluteContainerName := CgroupName(path.Join(rootContainer, string(qosClass)))
+
+		resourceParameters := &ResourceConfig{}
+		// the BestEffort QoS class has a statically configured minShares value
+		if qosClass == v1.PodQOSBestEffort {
+			minShares := int64(MinShares)
+			resourceParameters.CpuShares = &minShares
+		}
+		// containerConfig object stores the cgroup specifications
+		containerConfig := &CgroupConfig{
+			Name:               absoluteContainerName,
+			ResourceParameters: resourceParameters,
+		}
+		// check if it exists
+		if !cm.Exists(absoluteContainerName) {
+			if err := cm.Create(containerConfig); err != nil {
+				return fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
+			}
+		} else {
+			// to ensure we actually have the right state, we update the config on startup
+			if err := cm.Update(containerConfig); err != nil {
+				return fmt.Errorf("failed to update top level %v QOS cgroup : %v", qosClass, err)
+			}
+		}
+	}
+	// Store the top level qos container names
+	m.qosContainersInfo = QOSContainersInfo{
+		Guaranteed: rootContainer,
+		Burstable:  path.Join(rootContainer, string(v1.PodQOSBurstable)),
+		BestEffort: path.Join(rootContainer, string(v1.PodQOSBestEffort)),
+	}
+	m.nodeInfo = nodeInfo
+	m.activePods = activePods
+
+	return nil
+}
+
+func (m *qosContainerManagerImpl) UpdateCgroups() error {
+	m.Lock()
+	defer m.Unlock()
+
+	// TODO: Update cgroups
+
+	return nil
+}
+
+type qosContainerManagerNoop struct {
+	cgroupRoot CgroupName
+}
+
+var _ QOSContainerManager = &qosContainerManagerNoop{}
+
+func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
+	return QOSContainersInfo{}
+}
+
+func (m *qosContainerManagerNoop) Start(_ *v1.Node, _ ActivePodsFunc) error {
+	return nil
+}
+
+func (m *qosContainerManagerNoop) UpdateCgroups() error {
+	return nil
+}
diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index 878c9556506..b89e7fcd96e 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -1173,7 +1173,7 @@ func (kl *Kubelet) initializeModules() error {
 		return fmt.Errorf("Kubelet failed to get node info: %v", err)
 	}
 
-	if err := kl.containerManager.Start(node); err != nil {
+	if err := kl.containerManager.Start(node, kl.getActivePods); err != nil {
 		return fmt.Errorf("Failed to start ContainerManager %v", err)
 	}
 
@@ -1468,8 +1468,13 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
 		// they are not expected to run again.
 		// We don't create and apply updates to cgroup if its a run once pod and was killed above
 		if !(podKilled && pod.Spec.RestartPolicy == v1.RestartPolicyNever) {
-			if err := pcm.EnsureExists(pod); err != nil {
-				return fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err)
+			if !pcm.Exists(pod) {
+				if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
+					glog.V(2).Infof("Failed to update QoS cgroups while syncing pod: %v", err)
+				}
+				if err := pcm.EnsureExists(pod); err != nil {
+					return fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err)
+				}
 			}
 		}
 	}
diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go
index 453d1d57bc9..8cdbe3ae138 100644
--- a/pkg/kubelet/kubelet_pods.go
+++ b/pkg/kubelet/kubelet_pods.go
@@ -654,6 +654,9 @@ func (kl *Kubelet) killPod(pod *v1.Pod, runningPod *kubecontainer.Pod, status *k
 			glog.Warningf("Failed to reduce the CPU values to the minimum amount of shares: %v", err)
 		}
 	}
+	if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
+		glog.V(2).Infof("Failed to update QoS cgroups while killing pod: %v", err)
+	}
 	return nil
 }
 
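
For context, below the patch: a minimal, illustrative Go sketch of how a kubelet-like caller wires the new Start(node, activePods) signature together with UpdateQOSCgroups. This is not part of the change; the podSource type and its fields are hypothetical stand-ins for the kubelet's real plumbing, and only ContainerManager, ActivePodsFunc, and UpdateQOSCgroups come from this patch.

// Illustrative sketch, not part of the patch. podSource is a hypothetical
// stand-in for the kubelet's pod bookkeeping.
package qosdemo

import (
	"log"

	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/kubelet/cm"
)

type podSource struct {
	containerManager cm.ContainerManager
	pods             []*v1.Pod // hypothetical source of truth for active pods
}

// getActivePods satisfies cm.ActivePodsFunc.
func (s *podSource) getActivePods() []*v1.Pod {
	return s.pods
}

func (s *podSource) start(node *v1.Node) error {
	// The container manager captures the callback so the QoS container
	// manager can later recompute top-level cgroup values from the live
	// pod set rather than from a stale snapshot.
	return s.containerManager.Start(node, s.getActivePods)
}

func (s *podSource) onPodLifecycleEvent() {
	// Mirrors the call sites in syncPod and killPod: reconcile the QoS
	// cgroups, but treat failure as non-fatal and only log it.
	if err := s.containerManager.UpdateQOSCgroups(); err != nil {
		log.Printf("failed to update QoS cgroups: %v", err)
	}
}

Passing the pod supplier as a callback rather than a one-time list is what lets UpdateQOSCgroups remain a cheap, thread-safe reconcile that both syncPod and killPod can invoke.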
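
UpdateCgroups is deliberately left as a TODO in this patch. One plausible direction, sketched below purely as an assumption and not as the author's implementation, is to recompute the Burstable top-level cpu.shares from the CPU requests of the currently active burstable pods; that would explain why Start captures the activePods callback. QoS-class filtering is elided here, and the conversion uses the standard cgroups 1024-shares-per-CPU formula.

// Hypothetical sketch of what the stubbed UpdateCgroups could grow into.
// Not part of the patch; burstableCPUShares and sharesPerCPU are invented
// names for illustration only.
package qosdemo

import (
	"k8s.io/kubernetes/pkg/api/v1"
)

// burstableCPUShares derives a cpu.shares value for the Burstable top-level
// cgroup from the CPU requests of the given (pre-filtered) burstable pods.
func burstableCPUShares(burstablePods []*v1.Pod) int64 {
	const sharesPerCPU = 1024 // cgroups convention: 1 CPU == 1024 shares
	var milliCPU int64
	for _, pod := range burstablePods {
		for _, container := range pod.Spec.Containers {
			if request, ok := container.Resources.Requests[v1.ResourceCPU]; ok {
				milliCPU += request.MilliValue()
			}
		}
	}
	// standard conversion from milli-CPUs of request to cpu.shares
	return milliCPU * sharesPerCPU / 1000
}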