
This ports the remaining kubernetes 1.24.4 patches. The following patches were refactored slightly to allow for upstream changes: kubelet-cpumanager-disable-CFS-quota-throttling-for-.patch kubelet-cpumanager-keep-normal-containers-off-reserv.patch The following patch was added to get tests working again: cpumanager-policy-static-test-refactor.patch Test-plan: 1. Revert-use-subpath-for-coredns-only-for-default-repo: Kubeadm commands worked as expected 2. enable-support-for-kubernetes-to-ignore-isolcpus: Set kube-cpu-mgr-policy as static and allocated some isolcpus. i. Deployed a pod with dedicated CPU and verified that it is not affined to isolcpus. ii. kube-ignore-isol-cpus is set to enabled, deployed a pod with dedicated CPU and verified that is allocated to isolated CPU. 3. kubeadm-create-platform-pods-with-zero-CPU-resources: Verified the usage of CPUs is 0 in coredns, kube-controller-manager, kube-scheduler, and kube-apiserver pods of kube-system namespace. 4. kubelet-cpumanager-disable-CFS-quota-throttling-for- : Verified that pods that in the "Guaranteed" QoS class, on hosts that have "kube-cpu-mgr-policy=static" have cpu.cfs_quota_us set to -1. 5. kubelet-cpumanager-infra-pods-use-system-reserved-CP: Verified that platform pods are affined to platform CPUs 6. kubelet-cpumanager-introduce-concept-of-isolated-CPU: Verified pods can allocate isolated CPUs and are affined to them. Verified pods allocating application CPUs don't get isolated CPUs. Verified that pods allocating dedicated and isolated CPUs are affined to the dedicated CPUs. Verified that pods allocating non-dedicated and isolated CPUs are affined to the isolated CPUs. 7. kubelet-cpumanager-keep-normal-containers-off-reserv: Verified the pod which is not in platform namespace are affined to application or application isolated CPUs 8. kubelet-sort-isolcpus-allocation-when-SMT-enabled: Verified after enabling SMT multithreading that isolated CPUs are allocated as lowest-numbered SMT siblings first and then higher-numbered SMT siblings, then any single thread. 9. kubernetes-make-isolcpus-allocation-SMT-aware: Verified after enabling SMT multithreading that isolated CPUs are allocated as pairs of SMT siblings first, then already-existing single SMT siblings, then we allocate one of a pair of SMT siblings as a last resort. Story: 2010301 Task: 46315 Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com> Signed-off-by: Chris Friesen <chris.friesen@windriver.com> Change-Id: Ic8f3d53f58f09ae13f9c299fb31e5f91a0a5bc9f
256 lines
12 KiB
Diff
256 lines
12 KiB
Diff
From 95e547b2d3d0af6b0f2083c064bcbdbe39716250 Mon Sep 17 00:00:00 2001
|
|
From: Sachin Gopala Krishna <saching.krishna@windriver.com>
|
|
Date: Mon, 3 Oct 2022 19:19:48 -0400
|
|
Subject: [PATCH] kubelet cpumanager disable CFS quota throttling
|
|
|
|
This disables CFS CPU quota to avoid performance degradation due to
|
|
Linux kernel CFS quota implementation. Note that 4.18 kernel attempts
|
|
to solve the CFS throttling problem, but there are reports that it is
|
|
not completely effective.
|
|
|
|
This disables CFS quota throttling for Guaranteed pods for both
|
|
parent and container cgroups by writing -1 to cgroup cpu.cfs_quota_us.
|
|
Disabling has a dramatic latency improvement for HTTP response times.
|
|
|
|
This patch is refactored in 1.22.5 due to new internal_container_lifecycle
|
|
framework. We leverage the same mechanism to set Linux resources as:
|
|
cpu manager: specify the container CPU set during the creation
|
|
|
|
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
|
|
Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
|
|
---
|
|
pkg/kubelet/cm/cpumanager/cpu_manager.go | 7 +++
|
|
pkg/kubelet/cm/cpumanager/fake_cpu_manager.go | 10 ++++-
|
|
pkg/kubelet/cm/helpers_linux.go | 10 +++++
|
|
pkg/kubelet/cm/helpers_linux_test.go | 43 ++++++++++---------
|
|
.../cm/internal_container_lifecycle_linux.go | 9 ++++
|
|
5 files changed, 57 insertions(+), 22 deletions(-)
|
|
|
|
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
|
|
index dde49b6ec8c..df431b06601 100644
|
|
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
|
|
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
|
|
@@ -72,6 +72,9 @@ type Manager interface {
|
|
// State returns a read-only interface to the internal CPU manager state.
|
|
State() state.Reader
|
|
|
|
+ // GetCPUPolicy returns the assigned CPU manager policy
|
|
+ GetCPUPolicy() string
|
|
+
|
|
// GetTopologyHints implements the topologymanager.HintProvider Interface
|
|
// and is consulted to achieve NUMA aware resource alignment among this
|
|
// and other resource controllers.
|
|
@@ -314,6 +317,10 @@ func (m *manager) State() state.Reader {
|
|
return m.state
|
|
}
|
|
|
|
+func (m *manager) GetCPUPolicy() string {
|
|
+ return m.policy.Name()
|
|
+}
|
|
+
|
|
func (m *manager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
|
|
// The pod is during the admission phase. We need to save the pod to avoid it
|
|
// being cleaned before the admission ended
|
|
diff --git a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
|
|
index 93369705135..2e277da9c84 100644
|
|
--- a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
|
|
+++ b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
|
|
@@ -28,7 +28,8 @@ import (
|
|
)
|
|
|
|
type fakeManager struct {
|
|
- state state.State
|
|
+ policy Policy
|
|
+ state state.State
|
|
}
|
|
|
|
func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
|
|
@@ -70,6 +71,10 @@ func (m *fakeManager) State() state.Reader {
|
|
return m.state
|
|
}
|
|
|
|
+func (m *fakeManager) GetCPUPolicy() string {
|
|
+ return m.policy.Name()
|
|
+}
|
|
+
|
|
func (m *fakeManager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet {
|
|
klog.InfoS("GetExclusiveCPUs", "podUID", podUID, "containerName", containerName)
|
|
return cpuset.CPUSet{}
|
|
@@ -88,6 +93,7 @@ func (m *fakeManager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet
|
|
// NewFakeManager creates empty/fake cpu manager
|
|
func NewFakeManager() Manager {
|
|
return &fakeManager{
|
|
- state: state.NewMemoryState(),
|
|
+ policy: &nonePolicy{},
|
|
+ state: state.NewMemoryState(),
|
|
}
|
|
}
|
|
diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go
|
|
index 25ff3f13b82..e9ea6bab8dc 100644
|
|
--- a/pkg/kubelet/cm/helpers_linux.go
|
|
+++ b/pkg/kubelet/cm/helpers_linux.go
|
|
@@ -182,6 +182,16 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64,
|
|
// build the result
|
|
result := &ResourceConfig{}
|
|
if qosClass == v1.PodQOSGuaranteed {
|
|
+ // Disable CFS CPU quota to avoid performance degradation due to
|
|
+ // Linux kernel CFS throttle implementation.
|
|
+ // NOTE: 4.18 kernel attempts to solve CFS throttling problem,
|
|
+ // but there are reports that it is not completely effective.
|
|
+ // This will configure cgroup CFS parameters at pod level:
|
|
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/cpu.cfs_quota_us
|
|
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/cpu.cfs_period_us
|
|
+ cpuQuota = int64(-1)
|
|
+ cpuPeriod = uint64(100000)
|
|
+
|
|
result.CpuShares = &cpuShares
|
|
result.CpuQuota = &cpuQuota
|
|
result.CpuPeriod = &cpuPeriod
|
|
diff --git a/pkg/kubelet/cm/helpers_linux_test.go b/pkg/kubelet/cm/helpers_linux_test.go
|
|
index 101b21e682a..9b98fb7e1c1 100644
|
|
--- a/pkg/kubelet/cm/helpers_linux_test.go
|
|
+++ b/pkg/kubelet/cm/helpers_linux_test.go
|
|
@@ -64,8 +64,9 @@ func TestResourceConfigForPod(t *testing.T) {
|
|
burstablePartialShares := MilliCPUToShares(200)
|
|
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
|
|
guaranteedShares := MilliCPUToShares(100)
|
|
- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
|
|
- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
|
|
+ guaranteedQuotaPeriod := uint64(100000)
|
|
+ guaranteedQuota := int64(-1)
|
|
+ guaranteedTunedQuota := int64(-1)
|
|
memoryQuantity = resource.MustParse("100Mi")
|
|
cpuNoLimit := int64(-1)
|
|
guaranteedMemory := memoryQuantity.Value()
|
|
@@ -204,8 +205,8 @@ func TestResourceConfigForPod(t *testing.T) {
|
|
},
|
|
},
|
|
enforceCPULimits: true,
|
|
- quotaPeriod: defaultQuotaPeriod,
|
|
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
|
|
+ quotaPeriod: guaranteedQuotaPeriod,
|
|
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
|
|
},
|
|
"guaranteed-no-cpu-enforcement": {
|
|
pod: &v1.Pod{
|
|
@@ -218,8 +219,8 @@ func TestResourceConfigForPod(t *testing.T) {
|
|
},
|
|
},
|
|
enforceCPULimits: false,
|
|
- quotaPeriod: defaultQuotaPeriod,
|
|
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
|
|
+ quotaPeriod: guaranteedQuotaPeriod,
|
|
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
|
|
},
|
|
"guaranteed-with-tuned-quota": {
|
|
pod: &v1.Pod{
|
|
@@ -232,8 +233,8 @@ func TestResourceConfigForPod(t *testing.T) {
|
|
},
|
|
},
|
|
enforceCPULimits: true,
|
|
- quotaPeriod: tunedQuotaPeriod,
|
|
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
|
|
+ quotaPeriod: guaranteedQuotaPeriod,
|
|
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
|
|
},
|
|
"guaranteed-no-cpu-enforcement-with-tuned-quota": {
|
|
pod: &v1.Pod{
|
|
@@ -246,8 +247,8 @@ func TestResourceConfigForPod(t *testing.T) {
|
|
},
|
|
},
|
|
enforceCPULimits: false,
|
|
- quotaPeriod: tunedQuotaPeriod,
|
|
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
|
|
+ quotaPeriod: guaranteedQuotaPeriod,
|
|
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
|
|
},
|
|
"burstable-partial-limits-with-init-containers": {
|
|
pod: &v1.Pod{
|
|
@@ -309,8 +310,10 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
|
|
burstablePartialShares := MilliCPUToShares(200)
|
|
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
|
|
guaranteedShares := MilliCPUToShares(100)
|
|
- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
|
|
- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
|
|
+ guaranteedQuotaPeriod := uint64(100000)
|
|
+ guaranteedQuota := int64(-1)
|
|
+ guaranteedTunedQuota := int64(-1)
|
|
+
|
|
memoryQuantity = resource.MustParse("100Mi")
|
|
cpuNoLimit := int64(-1)
|
|
guaranteedMemory := memoryQuantity.Value()
|
|
@@ -449,8 +452,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
|
|
},
|
|
},
|
|
enforceCPULimits: true,
|
|
- quotaPeriod: defaultQuotaPeriod,
|
|
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
|
|
+ quotaPeriod: guaranteedQuotaPeriod,
|
|
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
|
|
},
|
|
"guaranteed-no-cpu-enforcement": {
|
|
pod: &v1.Pod{
|
|
@@ -463,8 +466,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
|
|
},
|
|
},
|
|
enforceCPULimits: false,
|
|
- quotaPeriod: defaultQuotaPeriod,
|
|
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
|
|
+ quotaPeriod: guaranteedQuotaPeriod,
|
|
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
|
|
},
|
|
"guaranteed-with-tuned-quota": {
|
|
pod: &v1.Pod{
|
|
@@ -477,8 +480,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
|
|
},
|
|
},
|
|
enforceCPULimits: true,
|
|
- quotaPeriod: tunedQuotaPeriod,
|
|
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
|
|
+ quotaPeriod: guaranteedQuotaPeriod,
|
|
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
|
|
},
|
|
"guaranteed-no-cpu-enforcement-with-tuned-quota": {
|
|
pod: &v1.Pod{
|
|
@@ -491,8 +494,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
|
|
},
|
|
},
|
|
enforceCPULimits: false,
|
|
- quotaPeriod: tunedQuotaPeriod,
|
|
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
|
|
+ quotaPeriod: guaranteedQuotaPeriod,
|
|
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
|
|
},
|
|
}
|
|
|
|
diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
|
|
index cb7c0cfa543..75406dd8564 100644
|
|
--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go
|
|
+++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
|
|
@@ -25,6 +25,7 @@ import (
|
|
|
|
"k8s.io/api/core/v1"
|
|
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
|
|
+ v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
|
|
)
|
|
|
|
func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
|
|
@@ -35,6 +36,14 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain
|
|
}
|
|
}
|
|
|
|
+ // Disable cgroup CFS throttle at the container level.
|
|
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_quota_us
|
|
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_period_us
|
|
+ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed {
|
|
+ containerConfig.Linux.Resources.CpuPeriod = int64(100000)
|
|
+ containerConfig.Linux.Resources.CpuQuota = int64(-1)
|
|
+ }
|
|
+
|
|
if i.memoryManager != nil {
|
|
numaNodes := i.memoryManager.GetMemoryNUMANodes(pod, container)
|
|
if numaNodes.Len() > 0 {
|
|
--
|
|
2.25.1
|
|
|