Kubelet changes to support hugepages

This commit is contained in:
Derek Carr 2017-08-17 14:28:15 -04:00
parent afd8045ed7
commit 1ec2a69d9a
9 changed files with 154 additions and 6 deletions

View File

@ -23,11 +23,14 @@ go_library(
"//conditions:default": [], "//conditions:default": [],
}), }),
deps = [ deps = [
"//pkg/api/v1/helper:go_default_library",
"//pkg/features:go_default_library",
"//vendor/github.com/google/cadvisor/events:go_default_library", "//vendor/github.com/google/cadvisor/events:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library", "//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/github.com/google/cadvisor/info/v2:go_default_library", "//vendor/github.com/google/cadvisor/info/v2:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
] + select({ ] + select({
"@io_bazel_rules_go//go/platform:linux_amd64": [ "@io_bazel_rules_go//go/platform:linux_amd64": [
"//pkg/kubelet/types:go_default_library", "//pkg/kubelet/types:go_default_library",

View File

@ -21,6 +21,9 @@ import (
cadvisorapi2 "github.com/google/cadvisor/info/v2" cadvisorapi2 "github.com/google/cadvisor/info/v2"
"k8s.io/api/core/v1" "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/api/resource"
utilfeature "k8s.io/apiserver/pkg/util/feature"
v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
"k8s.io/kubernetes/pkg/features"
) )
func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList { func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
@ -32,6 +35,17 @@ func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
int64(info.MemoryCapacity), int64(info.MemoryCapacity),
resource.BinarySI), resource.BinarySI),
} }
// if huge pages are enabled, we report them as a schedulable resource on the node
if utilfeature.DefaultFeatureGate.Enabled(features.HugePages) {
for _, hugepagesInfo := range info.HugePages {
pageSizeBytes := int64(hugepagesInfo.PageSize * 1024)
hugePagesBytes := pageSizeBytes * int64(hugepagesInfo.NumPages)
pageSizeQuantity := resource.NewQuantity(pageSizeBytes, resource.BinarySI)
c[v1helper.HugePageResourceName(*pageSizeQuantity)] = *resource.NewQuantity(hugePagesBytes, resource.BinarySI)
}
}
return c return c
} }

View File

@ -52,6 +52,7 @@ go_library(
] + select({ ] + select({
"@io_bazel_rules_go//go/platform:linux_amd64": [ "@io_bazel_rules_go//go/platform:linux_amd64": [
"//pkg/api:go_default_library", "//pkg/api:go_default_library",
"//pkg/api/v1/helper:go_default_library",
"//pkg/api/v1/helper/qos:go_default_library", "//pkg/api/v1/helper/qos:go_default_library",
"//pkg/api/v1/resource:go_default_library", "//pkg/api/v1/resource:go_default_library",
"//pkg/kubelet/cm/util:go_default_library", "//pkg/kubelet/cm/util:go_default_library",
@ -63,6 +64,7 @@ go_library(
"//pkg/util/procfs:go_default_library", "//pkg/util/procfs:go_default_library",
"//pkg/util/sysctl:go_default_library", "//pkg/util/sysctl:go_default_library",
"//pkg/util/version:go_default_library", "//pkg/util/version:go_default_library",
"//vendor/github.com/docker/go-units:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library",

View File

@ -24,12 +24,16 @@ import (
"strings" "strings"
"time" "time"
units "github.com/docker/go-units"
"github.com/golang/glog" "github.com/golang/glog"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd" cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs" libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
"k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/metrics" "k8s.io/kubernetes/pkg/kubelet/metrics"
) )
@ -43,6 +47,10 @@ const (
libcontainerSystemd libcontainerCgroupManagerType = "systemd" libcontainerSystemd libcontainerCgroupManagerType = "systemd"
) )
// hugePageSizeList is useful for converting to the hugetlb canonical unit
// which is what is expected when interacting with libcontainer
var hugePageSizeList = []string{"", "kB", "MB", "GB", "TB", "PB"}
// ConvertCgroupNameToSystemd converts the internal cgroup name to a systemd name. // ConvertCgroupNameToSystemd converts the internal cgroup name to a systemd name.
// For example, the name /Burstable/pod_123-456 becomes Burstable-pod_123_456.slice // For example, the name /Burstable/pod_123-456 becomes Burstable-pod_123_456.slice
// If outputToCgroupFs is true, it expands the systemd name into the cgroupfs form. // If outputToCgroupFs is true, it expands the systemd name into the cgroupfs form.
@ -299,10 +307,16 @@ type subsystem interface {
GetStats(path string, stats *libcontainercgroups.Stats) error GetStats(path string, stats *libcontainercgroups.Stats) error
} }
// Cgroup subsystems we currently support // getSupportedSubsystems returns list of subsystems supported
var supportedSubsystems = []subsystem{ func getSupportedSubsystems() []subsystem {
supportedSubsystems := []subsystem{
&cgroupfs.MemoryGroup{}, &cgroupfs.MemoryGroup{},
&cgroupfs.CpuGroup{}, &cgroupfs.CpuGroup{},
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
supportedSubsystems = append(supportedSubsystems, &cgroupfs.HugetlbGroup{})
}
return supportedSubsystems
} }
// setSupportedSubsystems sets cgroup resource limits only on the supported // setSupportedSubsystems sets cgroup resource limits only on the supported
@ -315,7 +329,7 @@ var supportedSubsystems = []subsystem{
// but this is not possible with libcontainers Set() method // but this is not possible with libcontainers Set() method
// See https://github.com/opencontainers/runc/issues/932 // See https://github.com/opencontainers/runc/issues/932
func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error { func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error {
for _, sys := range supportedSubsystems { for _, sys := range getSupportedSubsystems() {
if _, ok := cgroupConfig.Paths[sys.Name()]; !ok { if _, ok := cgroupConfig.Paths[sys.Name()]; !ok {
return fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name()) return fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name())
} }
@ -343,6 +357,30 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont
if resourceConfig.CpuPeriod != nil { if resourceConfig.CpuPeriod != nil {
resources.CpuPeriod = *resourceConfig.CpuPeriod resources.CpuPeriod = *resourceConfig.CpuPeriod
} }
// if huge pages are enabled, we set them in libcontainer
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
// for each page size enumerated, set that value
pageSizes := sets.NewString()
for pageSize, limit := range resourceConfig.HugePageLimit {
sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, hugePageSizeList)
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
Pagesize: sizeString,
Limit: uint64(limit),
})
pageSizes.Insert(sizeString)
}
// for each page size omitted, limit to 0
for _, pageSize := range cgroupfs.HugePageSizes {
if pageSizes.Has(pageSize) {
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
Pagesize: pageSize,
Limit: uint64(0),
})
}
}
return resources return resources
} }
@ -502,7 +540,7 @@ func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error {
func getStatsSupportedSubsystems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) { func getStatsSupportedSubsystems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) {
stats := libcontainercgroups.NewStats() stats := libcontainercgroups.NewStats()
for _, sys := range supportedSubsystems { for _, sys := range getSupportedSubsystems() {
if _, ok := cgroupPaths[sys.Name()]; !ok { if _, ok := cgroupPaths[sys.Name()]; !ok {
return nil, fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name()) return nil, fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name())
} }

View File

@ -26,6 +26,7 @@ import (
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"k8s.io/api/core/v1" "k8s.io/api/core/v1"
v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos" v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
"k8s.io/kubernetes/pkg/api/v1/resource" "k8s.io/kubernetes/pkg/api/v1/resource"
) )
@ -83,6 +84,23 @@ func MilliCPUToShares(milliCPU int64) int64 {
return shares return shares
} }
// HugePageLimits converts the API representation to a map
// from huge page size (in bytes) to huge page limit (in bytes).
func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
hugePageLimits := map[int64]int64{}
for k, v := range resourceList {
if v1helper.IsHugePageResourceName(k) {
pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
if value, exists := hugePageLimits[pageSize.Value()]; exists {
hugePageLimits[pageSize.Value()] = value + v.Value()
} else {
hugePageLimits[pageSize.Value()] = v.Value()
}
}
}
return hugePageLimits
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config. // ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig { func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
// sum requests and limits. // sum requests and limits.
@ -108,6 +126,8 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
// track if limits were applied for each resource. // track if limits were applied for each resource.
memoryLimitsDeclared := true memoryLimitsDeclared := true
cpuLimitsDeclared := true cpuLimitsDeclared := true
// map hugepage pagesize (bytes) to limits (bytes)
hugePageLimits := map[int64]int64{}
for _, container := range pod.Spec.Containers { for _, container := range pod.Spec.Containers {
if container.Resources.Limits.Cpu().IsZero() { if container.Resources.Limits.Cpu().IsZero() {
cpuLimitsDeclared = false cpuLimitsDeclared = false
@ -115,6 +135,14 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
if container.Resources.Limits.Memory().IsZero() { if container.Resources.Limits.Memory().IsZero() {
memoryLimitsDeclared = false memoryLimitsDeclared = false
} }
containerHugePageLimits := HugePageLimits(container.Resources.Requests)
for k, v := range containerHugePageLimits {
if value, exists := hugePageLimits[k]; exists {
hugePageLimits[k] = value + v
} else {
hugePageLimits[k] = v
}
}
} }
// determine the qos class // determine the qos class
@ -140,6 +168,7 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
shares := int64(MinShares) shares := int64(MinShares)
result.CpuShares = &shares result.CpuShares = &shares
} }
result.HugePageLimit = hugePageLimits
return result return result
} }

View File

@ -28,7 +28,9 @@ import (
"k8s.io/api/core/v1" "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/events" "k8s.io/kubernetes/pkg/kubelet/events"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
) )
@ -154,6 +156,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
val := MilliCPUToShares(q.MilliValue()) val := MilliCPUToShares(q.MilliValue())
rc.CpuShares = &val rc.CpuShares = &val
} }
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
rc.HugePageLimit = HugePageLimits(rl)
}
return &rc return &rc
} }

View File

@ -27,9 +27,13 @@ import (
"k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/util/wait"
units "github.com/docker/go-units"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
"k8s.io/api/core/v1" "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos" v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
"k8s.io/kubernetes/pkg/api/v1/resource" "k8s.io/kubernetes/pkg/api/v1/resource"
kubefeatures "k8s.io/kubernetes/pkg/features"
) )
const ( const (
@ -100,11 +104,18 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis
minShares := int64(MinShares) minShares := int64(MinShares)
resourceParameters.CpuShares = &minShares resourceParameters.CpuShares = &minShares
} }
// containerConfig object stores the cgroup specifications // containerConfig object stores the cgroup specifications
containerConfig := &CgroupConfig{ containerConfig := &CgroupConfig{
Name: absoluteContainerName, Name: absoluteContainerName,
ResourceParameters: resourceParameters, ResourceParameters: resourceParameters,
} }
// for each enumerated huge page size, the qos tiers are unbounded
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
m.setHugePagesUnbounded(containerConfig)
}
// check if it exists // check if it exists
if !cm.Exists(absoluteContainerName) { if !cm.Exists(absoluteContainerName) {
if err := cm.Create(containerConfig); err != nil { if err := cm.Create(containerConfig); err != nil {
@ -138,6 +149,29 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis
return nil return nil
} }
// setHugePagesUnbounded ensures hugetlb is effectively unbounded
func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
hugePageLimit := map[int64]int64{}
for _, pageSize := range cgroupfs.HugePageSizes {
pageSizeBytes, err := units.RAMInBytes(pageSize)
if err != nil {
return err
}
hugePageLimit[pageSizeBytes] = int64(1 << 62)
}
cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
return nil
}
func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
for _, v := range configs {
if err := m.setHugePagesUnbounded(v); err != nil {
return err
}
}
return nil
}
func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error { func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
pods := m.activePods() pods := m.activePods()
burstablePodCPURequest := int64(0) burstablePodCPURequest := int64(0)
@ -262,6 +296,13 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
return err return err
} }
// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
if err := m.setHugePagesConfig(qosConfigs); err != nil {
return err
}
}
for resource, percentReserve := range m.qosReserved { for resource, percentReserve := range m.qosReserved {
switch resource { switch resource {
case v1.ResourceMemory: case v1.ResourceMemory:

View File

@ -31,6 +31,8 @@ type ResourceConfig struct {
CpuQuota *int64 CpuQuota *int64
// CPU quota period. // CPU quota period.
CpuPeriod *int64 CpuPeriod *int64
// HugePageLimit map from page size (in bytes) to limit (in bytes)
HugePageLimit map[int64]int64
} }
// CgroupName is the abstract name of a cgroup prior to any driver specific conversion. // CgroupName is the abstract name of a cgroup prior to any driver specific conversion.

View File

@ -630,6 +630,19 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
} }
node.Status.Allocatable[k] = value node.Status.Allocatable[k] = value
} }
// for every huge page reservation, we need to remove it from allocatable memory
for k, v := range node.Status.Capacity {
if v1helper.IsHugePageResourceName(k) {
allocatableMemory := node.Status.Allocatable[v1.ResourceMemory]
value := *(v.Copy())
allocatableMemory.Sub(value)
if allocatableMemory.Sign() < 0 {
// Negative Allocatable resources don't make sense.
allocatableMemory.Set(0)
}
node.Status.Allocatable[v1.ResourceMemory] = allocatableMemory
}
}
} }
// Set versioninfo for the node. // Set versioninfo for the node.