
The slice of strings more precisely captures the hierarchical nature of the cgroup paths we use to represent pods and their groupings. It also reduces the chances of passing an incorrectly formatted path to a cgroup driver that requires a different path naming scheme, since explicit conversions are now always needed. The new constructor NewCgroupName starts from an existing CgroupName, which enforces a hierarchy where a root is always needed. It also checks the component names to ensure invalid characters ("/" and "_") are not in use. A RootCgroupName for the top of the cgroup hierarchy tree is introduced. This refactor results in a net reduction of around 30 lines of code, mainly from the demise of ConvertCgroupNameToSystemd, which had fairly complicated logic in it and was doing too many things. There's a small TODO in a helper, updateSystemdCgroupInfo, that was introduced to make this commit possible. That logic really belongs in libcontainer; I'm planning to send a PR to include it there. (The API already takes a field with that information, but that field is only processed by the cgroupfs driver and not by the systemd driver; we should fix that.) Tested by running the e2e-node tests on both Ubuntu 16.04 (with the cgroupfs driver) and CentOS 7 (with the systemd driver).
281 lines
10 KiB
Go
281 lines
10 KiB
Go
/*
|
|
Copyright 2016 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package cm
|
|
|
|
import (
|
|
"fmt"
|
|
"io/ioutil"
|
|
"os"
|
|
"path"
|
|
"strings"
|
|
|
|
"github.com/golang/glog"
|
|
"k8s.io/api/core/v1"
|
|
"k8s.io/apimachinery/pkg/types"
|
|
utilerrors "k8s.io/apimachinery/pkg/util/errors"
|
|
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
|
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
|
|
kubefeatures "k8s.io/kubernetes/pkg/features"
|
|
)
|
|
|
|
// podCgroupNamePrefix is the prefix of a pod level cgroup name. A pod cgroup
// has the form pod<uid>; GetAllPodsFromCgroups relies on this prefix to
// recognize pod cgroups on disk.
const podCgroupNamePrefix = "pod"
|
|
|
|
// podContainerManagerImpl implements podContainerManager interface.
|
|
// It is the general implementation which allows pod level container
|
|
// management if qos Cgroup is enabled.
|
|
type podContainerManagerImpl struct {
|
|
// qosContainersInfo hold absolute paths of the top level qos containers
|
|
qosContainersInfo QOSContainersInfo
|
|
// Stores the mounted cgroup subsystems
|
|
subsystems *CgroupSubsystems
|
|
// cgroupManager is the cgroup Manager Object responsible for managing all
|
|
// pod cgroups.
|
|
cgroupManager CgroupManager
|
|
// Maximum number of pids in a pod
|
|
podPidsLimit int64
|
|
// enforceCPULimits controls whether cfs quota is enforced or not
|
|
enforceCPULimits bool
|
|
}
|
|
|
|
// Make sure that podContainerManagerImpl implements the PodContainerManager interface
|
|
var _ PodContainerManager = &podContainerManagerImpl{}
|
|
|
|
// applyLimits sets pod cgroup resource limits
|
|
// It also updates the resource limits on top level qos containers.
|
|
func (m *podContainerManagerImpl) applyLimits(pod *v1.Pod) error {
|
|
// This function will house the logic for setting the resource parameters
|
|
// on the pod container config and updating top level qos container configs
|
|
return nil
|
|
}
|
|
|
|
// Exists checks if the pod's cgroup already exists
|
|
func (m *podContainerManagerImpl) Exists(pod *v1.Pod) bool {
|
|
podContainerName, _ := m.GetPodContainerName(pod)
|
|
return m.cgroupManager.Exists(podContainerName)
|
|
}
|
|
|
|
// EnsureExists takes a pod as argument and makes sure that
|
|
// pod cgroup exists if qos cgroup hierarchy flag is enabled.
|
|
// If the pod level container doesn't already exist it is created.
|
|
func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
|
|
podContainerName, _ := m.GetPodContainerName(pod)
|
|
// check if container already exist
|
|
alreadyExists := m.Exists(pod)
|
|
if !alreadyExists {
|
|
// Create the pod container
|
|
containerConfig := &CgroupConfig{
|
|
Name: podContainerName,
|
|
ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits),
|
|
}
|
|
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && m.podPidsLimit > 0 {
|
|
containerConfig.ResourceParameters.PodPidsLimit = &m.podPidsLimit
|
|
}
|
|
if err := m.cgroupManager.Create(containerConfig); err != nil {
|
|
return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)
|
|
}
|
|
}
|
|
// Apply appropriate resource limits on the pod container
|
|
// Top level qos containers limits are not updated
|
|
// until we figure how to maintain the desired state in the kubelet.
|
|
// Because maintaining the desired state is difficult without checkpointing.
|
|
if err := m.applyLimits(pod); err != nil {
|
|
return fmt.Errorf("failed to apply resource limits on container for %v : %v", podContainerName, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host.
|
|
func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
|
|
podQOS := v1qos.GetPodQOS(pod)
|
|
// Get the parent QOS container name
|
|
var parentContainer CgroupName
|
|
switch podQOS {
|
|
case v1.PodQOSGuaranteed:
|
|
parentContainer = m.qosContainersInfo.Guaranteed
|
|
case v1.PodQOSBurstable:
|
|
parentContainer = m.qosContainersInfo.Burstable
|
|
case v1.PodQOSBestEffort:
|
|
parentContainer = m.qosContainersInfo.BestEffort
|
|
}
|
|
podContainer := GetPodCgroupNameSuffix(pod.UID)
|
|
|
|
// Get the absolute path of the cgroup
|
|
cgroupName := NewCgroupName(parentContainer, podContainer)
|
|
// Get the literal cgroupfs name
|
|
cgroupfsName := m.cgroupManager.Name(cgroupName)
|
|
|
|
return cgroupName, cgroupfsName
|
|
}
|
|
|
|
// Scan through the whole cgroup directory and kill all processes either
|
|
// attached to the pod cgroup or to a container cgroup under the pod cgroup
|
|
func (m *podContainerManagerImpl) tryKillingCgroupProcesses(podCgroup CgroupName) error {
|
|
pidsToKill := m.cgroupManager.Pids(podCgroup)
|
|
// No pids charged to the terminated pod cgroup return
|
|
if len(pidsToKill) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var errlist []error
|
|
// os.Kill often errors out,
|
|
// We try killing all the pids multiple times
|
|
for i := 0; i < 5; i++ {
|
|
if i != 0 {
|
|
glog.V(3).Infof("Attempt %v failed to kill all unwanted process. Retyring", i)
|
|
}
|
|
errlist = []error{}
|
|
for _, pid := range pidsToKill {
|
|
p, err := os.FindProcess(pid)
|
|
if err != nil {
|
|
// Process not running anymore, do nothing
|
|
continue
|
|
}
|
|
glog.V(3).Infof("Attempt to kill process with pid: %v", pid)
|
|
if err := p.Kill(); err != nil {
|
|
glog.V(3).Infof("failed to kill process with pid: %v", pid)
|
|
errlist = append(errlist, err)
|
|
}
|
|
}
|
|
if len(errlist) == 0 {
|
|
glog.V(3).Infof("successfully killed all unwanted processes.")
|
|
return nil
|
|
}
|
|
}
|
|
return utilerrors.NewAggregate(errlist)
|
|
}
|
|
|
|
// Destroy destroys the pod container cgroup paths
|
|
func (m *podContainerManagerImpl) Destroy(podCgroup CgroupName) error {
|
|
// Try killing all the processes attached to the pod cgroup
|
|
if err := m.tryKillingCgroupProcesses(podCgroup); err != nil {
|
|
glog.V(3).Infof("failed to kill all the processes attached to the %v cgroups", podCgroup)
|
|
return fmt.Errorf("failed to kill all the processes attached to the %v cgroups : %v", podCgroup, err)
|
|
}
|
|
|
|
// Now its safe to remove the pod's cgroup
|
|
containerConfig := &CgroupConfig{
|
|
Name: podCgroup,
|
|
ResourceParameters: &ResourceConfig{},
|
|
}
|
|
if err := m.cgroupManager.Destroy(containerConfig); err != nil {
|
|
return fmt.Errorf("failed to delete cgroup paths for %v : %v", podCgroup, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
|
|
func (m *podContainerManagerImpl) ReduceCPULimits(podCgroup CgroupName) error {
|
|
return m.cgroupManager.ReduceCPULimits(podCgroup)
|
|
}
|
|
|
|
// GetAllPodsFromCgroups scans through all the subsystems of pod cgroups
|
|
// Get list of pods whose cgroup still exist on the cgroup mounts
|
|
func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
|
|
// Map for storing all the found pods on the disk
|
|
foundPods := make(map[types.UID]CgroupName)
|
|
qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
|
|
// Scan through all the subsystem mounts
|
|
// and through each QoS cgroup directory for each subsystem mount
|
|
// If a pod cgroup exists in even a single subsystem mount
|
|
// we will attempt to delete it
|
|
for _, val := range m.subsystems.MountPoints {
|
|
for _, qosContainerName := range qosContainersList {
|
|
// get the subsystems QoS cgroup absolute name
|
|
qcConversion := m.cgroupManager.Name(qosContainerName)
|
|
qc := path.Join(val, qcConversion)
|
|
dirInfo, err := ioutil.ReadDir(qc)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
continue
|
|
}
|
|
return nil, fmt.Errorf("failed to read the cgroup directory %v : %v", qc, err)
|
|
}
|
|
for i := range dirInfo {
|
|
// its not a directory, so continue on...
|
|
if !dirInfo[i].IsDir() {
|
|
continue
|
|
}
|
|
// convert the concrete cgroupfs name back to an internal identifier
|
|
// this is needed to handle path conversion for systemd environments.
|
|
// we pass the fully qualified path so decoding can work as expected
|
|
// since systemd encodes the path in each segment.
|
|
cgroupfsPath := path.Join(qcConversion, dirInfo[i].Name())
|
|
internalPath := m.cgroupManager.CgroupName(cgroupfsPath)
|
|
// we only care about base segment of the converted path since that
|
|
// is what we are reading currently to know if it is a pod or not.
|
|
basePath := internalPath[len(internalPath)-1]
|
|
if !strings.Contains(basePath, podCgroupNamePrefix) {
|
|
continue
|
|
}
|
|
// we then split the name on the pod prefix to determine the uid
|
|
parts := strings.Split(basePath, podCgroupNamePrefix)
|
|
// the uid is missing, so we log the unexpected cgroup not of form pod<uid>
|
|
if len(parts) != 2 {
|
|
glog.Errorf("pod cgroup manager ignoring unexpected cgroup %v because it is not a pod", cgroupfsPath)
|
|
continue
|
|
}
|
|
podUID := parts[1]
|
|
foundPods[types.UID(podUID)] = internalPath
|
|
}
|
|
}
|
|
}
|
|
return foundPods, nil
|
|
}
|
|
|
|
// podContainerManagerNoop implements podContainerManager interface.
|
|
// It is a no-op implementation and basically does nothing
|
|
// podContainerManagerNoop is used in case the QoS cgroup Hierarchy is not
|
|
// enabled, so Exists() returns true always as the cgroupRoot
|
|
// is expected to always exist.
|
|
type podContainerManagerNoop struct {
|
|
cgroupRoot CgroupName
|
|
}
|
|
|
|
// Make sure that podContainerManagerStub implements the PodContainerManager interface
|
|
var _ PodContainerManager = &podContainerManagerNoop{}
|
|
|
|
func (m *podContainerManagerNoop) Exists(_ *v1.Pod) bool {
|
|
return true
|
|
}
|
|
|
|
func (m *podContainerManagerNoop) EnsureExists(_ *v1.Pod) error {
|
|
return nil
|
|
}
|
|
|
|
func (m *podContainerManagerNoop) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
|
|
return m.cgroupRoot, m.cgroupRoot.ToCgroupfs()
|
|
}
|
|
|
|
func (m *podContainerManagerNoop) GetPodContainerNameForDriver(_ *v1.Pod) string {
|
|
return ""
|
|
}
|
|
|
|
// Destroy destroys the pod container cgroup paths
|
|
func (m *podContainerManagerNoop) Destroy(_ CgroupName) error {
|
|
return nil
|
|
}
|
|
|
|
func (m *podContainerManagerNoop) ReduceCPULimits(_ CgroupName) error {
|
|
return nil
|
|
}
|
|
|
|
func (m *podContainerManagerNoop) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
|
|
return nil, nil
|
|
}
|