From 752e3e88d162aada55282aea7544e458e610c947 Mon Sep 17 00:00:00 2001 From: Boovan Rajendran Date: Wed, 30 Aug 2023 06:01:30 -0400 Subject: [PATCH] kubelet cpumanager disable CFS quota throttling This disables CFS CPU quota to avoid performance degradation due to Linux kernel CFS quota implementation. Note that 4.18 kernel attempts to solve the CFS throttling problem, but there are reports that it is not completely effective. This disables CFS quota throttling for Guaranteed pods for both parent and container cgroups by writing -1 to cgroup cpu.cfs_quota_us. Disabling has a dramatic latency improvement for HTTP response times. This patch is refactored in 1.22.5 due to new internal_container_lifecycle framework. We leverage the same mechanism to set Linux resources as: cpu manager: specify the container CPU set during the creation Co-authored-by: Jim Gauld Signed-off-by: Sachin Gopala Krishna Signed-off-by: Boovan Rajendran --- pkg/kubelet/cm/cpumanager/cpu_manager.go | 7 +++ pkg/kubelet/cm/cpumanager/fake_cpu_manager.go | 10 ++++- pkg/kubelet/cm/helpers_linux.go | 10 +++++ pkg/kubelet/cm/helpers_linux_test.go | 43 ++++++++++--------- .../cm/internal_container_lifecycle_linux.go | 9 ++++ 5 files changed, 57 insertions(+), 22 deletions(-) diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go index 8b5049d7d74..1d8901f4b36 100644 --- a/pkg/kubelet/cm/cpumanager/cpu_manager.go +++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go @@ -73,6 +73,9 @@ type Manager interface { // State returns a read-only interface to the internal CPU manager state. State() state.Reader + // GetCPUPolicy returns the assigned CPU manager policy + GetCPUPolicy() string + // GetTopologyHints implements the topologymanager.HintProvider Interface // and is consulted to achieve NUMA aware resource alignment among this // and other resource controllers. @@ -315,6 +318,10 @@ func (m *manager) State() state.Reader { return m.state } +func (m *manager) GetCPUPolicy() string { + return m.policy.Name() +} + func (m *manager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint { // The pod is during the admission phase. We need to save the pod to avoid it // being cleaned before the admission ended diff --git a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go index 4a03f3dd23f..b36fb0da3b4 100644 --- a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go +++ b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go @@ -28,7 +28,8 @@ import ( ) type fakeManager struct { - state state.State + policy Policy + state state.State } func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error { @@ -70,6 +71,10 @@ func (m *fakeManager) State() state.Reader { return m.state } +func (m *fakeManager) GetCPUPolicy() string { + return m.policy.Name() +} + func (m *fakeManager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet { klog.InfoS("GetExclusiveCPUs", "podUID", podUID, "containerName", containerName) return cpuset.CPUSet{} @@ -88,6 +93,7 @@ func (m *fakeManager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet // NewFakeManager creates empty/fake cpu manager func NewFakeManager() Manager { return &fakeManager{ - state: state.NewMemoryState(), + policy: &nonePolicy{}, + state: state.NewMemoryState(), } } diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go index 8a144e7a73c..008b955ee98 100644 --- a/pkg/kubelet/cm/helpers_linux.go +++ b/pkg/kubelet/cm/helpers_linux.go @@ -170,6 +170,16 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, // build the result result := &ResourceConfig{} if qosClass == v1.PodQOSGuaranteed { + // Disable CFS CPU quota to avoid performance degradation due to + // Linux kernel CFS throttle implementation. + // NOTE: 4.18 kernel attempts to solve CFS throttling problem, + // but there are reports that it is not completely effective. + // This will configure cgroup CFS parameters at pod level: + // /sys/fs/cgroup/cpu/k8s-infra/kubepods//cpu.cfs_quota_us + // /sys/fs/cgroup/cpu/k8s-infra/kubepods//cpu.cfs_period_us + cpuQuota = int64(-1) + cpuPeriod = uint64(100000) + result.CPUShares = &cpuShares result.CPUQuota = &cpuQuota result.CPUPeriod = &cpuPeriod diff --git a/pkg/kubelet/cm/helpers_linux_test.go b/pkg/kubelet/cm/helpers_linux_test.go index fba41fd49be..60609394659 100644 --- a/pkg/kubelet/cm/helpers_linux_test.go +++ b/pkg/kubelet/cm/helpers_linux_test.go @@ -64,8 +64,9 @@ func TestResourceConfigForPod(t *testing.T) { burstablePartialShares := MilliCPUToShares(200) burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod)) guaranteedShares := MilliCPUToShares(100) - guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod)) - guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod)) + guaranteedQuotaPeriod := uint64(100000) + guaranteedQuota := int64(-1) + guaranteedTunedQuota := int64(-1) memoryQuantity = resource.MustParse("100Mi") cpuNoLimit := int64(-1) guaranteedMemory := memoryQuantity.Value() @@ -204,8 +205,8 @@ func TestResourceConfigForPod(t *testing.T) { }, }, enforceCPULimits: true, - quotaPeriod: defaultQuotaPeriod, - expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, + quotaPeriod: guaranteedQuotaPeriod, + expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, }, "guaranteed-no-cpu-enforcement": { pod: &v1.Pod{ @@ -218,8 +219,8 @@ func TestResourceConfigForPod(t *testing.T) { }, }, enforceCPULimits: false, - quotaPeriod: defaultQuotaPeriod, - expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, + quotaPeriod: guaranteedQuotaPeriod, + expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, }, "guaranteed-with-tuned-quota": { pod: &v1.Pod{ @@ -232,8 +233,8 @@ func TestResourceConfigForPod(t *testing.T) { }, }, enforceCPULimits: true, - quotaPeriod: tunedQuotaPeriod, - expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, + quotaPeriod: guaranteedQuotaPeriod, + expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, }, "guaranteed-no-cpu-enforcement-with-tuned-quota": { pod: &v1.Pod{ @@ -246,8 +247,8 @@ func TestResourceConfigForPod(t *testing.T) { }, }, enforceCPULimits: false, - quotaPeriod: tunedQuotaPeriod, - expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, + quotaPeriod: guaranteedQuotaPeriod, + expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, }, "burstable-partial-limits-with-init-containers": { pod: &v1.Pod{ @@ -309,8 +310,10 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { burstablePartialShares := MilliCPUToShares(200) burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod)) guaranteedShares := MilliCPUToShares(100) - guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod)) - guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod)) + guaranteedQuotaPeriod := uint64(100000) + guaranteedQuota := int64(-1) + guaranteedTunedQuota := int64(-1) + memoryQuantity = resource.MustParse("100Mi") cpuNoLimit := int64(-1) guaranteedMemory := memoryQuantity.Value() @@ -449,8 +452,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { }, }, enforceCPULimits: true, - quotaPeriod: defaultQuotaPeriod, - expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, + quotaPeriod: guaranteedQuotaPeriod, + expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, }, "guaranteed-no-cpu-enforcement": { pod: &v1.Pod{ @@ -463,8 +466,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { }, }, enforceCPULimits: false, - quotaPeriod: defaultQuotaPeriod, - expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, + quotaPeriod: guaranteedQuotaPeriod, + expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, }, "guaranteed-with-tuned-quota": { pod: &v1.Pod{ @@ -477,8 +480,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { }, }, enforceCPULimits: true, - quotaPeriod: tunedQuotaPeriod, - expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, + quotaPeriod: guaranteedQuotaPeriod, + expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, }, "guaranteed-no-cpu-enforcement-with-tuned-quota": { pod: &v1.Pod{ @@ -491,8 +494,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { }, }, enforceCPULimits: false, - quotaPeriod: tunedQuotaPeriod, - expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, + quotaPeriod: guaranteedQuotaPeriod, + expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, }, } diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go index cb7c0cfa543..a99d01f8884 100644 --- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go +++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go @@ -25,6 +25,7 @@ import ( "k8s.io/api/core/v1" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" + v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" ) func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error { @@ -35,6 +36,14 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain } } + // Disable cgroup CFS throttle at the container level. + // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_quota_us + // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_period_us + if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed { + containerConfig.Linux.Resources.CpuPeriod = int64(100000) + containerConfig.Linux.Resources.CpuQuota = int64(-1) + } + if i.memoryManager != nil { numaNodes := i.memoryManager.GetMemoryNUMANodes(pod, container) if numaNodes.Len() > 0 { -- 2.25.1