Porting the K8s customizations to K8s 1.26.1

Taken from the previous version and modified the files for 1.26.1.

The following patches were applied cleanly and are included in
this commit:
enable-support-for-kubernetes-to-ignore-isolcpus.patch
kubeadm-create-platform-pods-with-zero-CPU-resources.patch
kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch
kubelet-cpumanager-disable-CFS-quota-throttling.patch
kubelet-cpumanager-keep-normal-containers-off-reserv.patch
kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch
Revert-use-subpath-for-coredns-only-for-default-repo.patch

The following patches did not apply cleanly. These were ported
to kubernetes 1.26.1 and included in this commit:
kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch
kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch
kubernetes-make-isolcpus-allocation-SMT-aware.patch

Note:
k8s 1.26.1 is not included in ISO yet

Test Plan:
PASS: Kubernetes package 1.26.1 builds properly.
PASS: Run all Kubelet, kubeadm, kubectl make tests for affected code.

Story: 2010368
Task: 47395

Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
Change-Id: I9c8ef9bb036aba503fde0f35ebac08a573375cd3
This commit is contained in:
Sachin Gopala Krishna 2023-03-28 11:31:39 -04:00
parent 32fd4c06d5
commit 36abd4557c
11 changed files with 2070 additions and 0 deletions

View File

@ -0,0 +1,120 @@
From c20052041d1786a78f796996aceca09de76613b7 Mon Sep 17 00:00:00 2001
From: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Date: Tue, 25 Oct 2022 19:44:44 -0400
Subject: [PATCH 02/10] Revert "use subpath for coredns only for default
repository"
This reverts commit 38a41e1557649a7cc763bf737779db9aa03ec75e.
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
Signed-off-by: Gleb Aronsky <gleb.aronsky@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
---
cmd/kubeadm/app/constants/constants.go | 2 +-
cmd/kubeadm/app/images/images.go | 5 ---
cmd/kubeadm/app/images/images_test.go | 49 --------------------------
3 files changed, 1 insertion(+), 55 deletions(-)
diff --git a/cmd/kubeadm/app/constants/constants.go b/cmd/kubeadm/app/constants/constants.go
index 544b5e96828..31ae3502d00 100644
--- a/cmd/kubeadm/app/constants/constants.go
+++ b/cmd/kubeadm/app/constants/constants.go
@@ -342,7 +342,7 @@ const (
CoreDNSDeploymentName = "coredns"
// CoreDNSImageName specifies the name of the image for CoreDNS add-on
- CoreDNSImageName = "coredns"
+ CoreDNSImageName = "coredns/coredns"
// CoreDNSVersion is the version of CoreDNS to be deployed if it is used
CoreDNSVersion = "v1.9.3"
diff --git a/cmd/kubeadm/app/images/images.go b/cmd/kubeadm/app/images/images.go
index 81c787b77a3..5099b260530 100644
--- a/cmd/kubeadm/app/images/images.go
+++ b/cmd/kubeadm/app/images/images.go
@@ -22,7 +22,6 @@ import (
"k8s.io/klog/v2"
kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
- kubeadmapiv1beta2 "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/v1beta2"
"k8s.io/kubernetes/cmd/kubeadm/app/constants"
kubeadmutil "k8s.io/kubernetes/cmd/kubeadm/app/util"
)
@@ -48,10 +47,6 @@ func GetDNSImage(cfg *kubeadmapi.ClusterConfiguration) string {
if cfg.DNS.ImageRepository != "" {
dnsImageRepository = cfg.DNS.ImageRepository
}
- // Handle the renaming of the official image from "registry.k8s.io/coredns" to "registry.k8s.io/coredns/coredns
- if dnsImageRepository == kubeadmapiv1beta2.DefaultImageRepository {
- dnsImageRepository = fmt.Sprintf("%s/coredns", dnsImageRepository)
- }
// DNS uses an imageTag that corresponds to the DNS version matching the Kubernetes version
dnsImageTag := constants.CoreDNSVersion
diff --git a/cmd/kubeadm/app/images/images_test.go b/cmd/kubeadm/app/images/images_test.go
index 60927ef9493..e24d4feca23 100644
--- a/cmd/kubeadm/app/images/images_test.go
+++ b/cmd/kubeadm/app/images/images_test.go
@@ -22,7 +22,6 @@ import (
"testing"
kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
- kubeadmapiv1beta2 "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/v1beta2"
"k8s.io/kubernetes/cmd/kubeadm/app/constants"
)
@@ -227,51 +226,3 @@ func TestGetAllImages(t *testing.T) {
}
}
-func TestGetDNSImage(t *testing.T) {
- var tests = []struct {
- expected string
- cfg *kubeadmapi.ClusterConfiguration
- }{
- {
- expected: "foo.io/coredns:v1.9.3",
- cfg: &kubeadmapi.ClusterConfiguration{
- ImageRepository: "foo.io",
- DNS: kubeadmapi.DNS{
- Type: kubeadmapi.CoreDNS,
- },
- },
- },
- {
- expected: kubeadmapiv1beta2.DefaultImageRepository + "/coredns/coredns:v1.9.3",
- cfg: &kubeadmapi.ClusterConfiguration{
- ImageRepository: kubeadmapiv1beta2.DefaultImageRepository,
- DNS: kubeadmapi.DNS{
- Type: kubeadmapi.CoreDNS,
- },
- },
- },
- {
- expected: "foo.io/coredns/coredns:v1.9.3",
- cfg: &kubeadmapi.ClusterConfiguration{
- ImageRepository: "foo.io",
- DNS: kubeadmapi.DNS{
- Type: kubeadmapi.CoreDNS,
- ImageMeta: kubeadmapi.ImageMeta{
- ImageRepository: "foo.io/coredns",
- },
- },
- },
- },
- }
-
- for _, test := range tests {
- actual := GetDNSImage(test.cfg)
- if actual != test.expected {
- t.Errorf(
- "failed to GetDNSImage:\n\texpected: %s\n\t actual: %s",
- test.expected,
- actual,
- )
- }
- }
-}
--
2.25.1

View File

@ -0,0 +1,80 @@
From 47e087ce7045aaf74cb4f3c845785a4cca101c4d Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Fri, 23 Oct 2020 17:46:10 -0600
Subject: [PATCH 09/10] enable support for kubernetes to ignore isolcpus
The normal mechanisms for allocating isolated CPUs do not allow
a mix of isolated and exclusive CPUs in the same container. In
order to allow this in *very* limited cases where the pod spec
is known in advance we will add the ability to disable the normal
isolcpus behaviour.
If the file "/etc/kubernetes/ignore_isolcpus" exists, then kubelet
will basically forget everything it knows about isolcpus and just
treat them like regular CPUs.
The admin user can then rely on the fact that CPU allocation is
deterministic to ensure that the isolcpus they configure end up being
allocated to the correct pods.
Signed-off-by: Daniel Safta <daniel.safta@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 8 ++++++++
pkg/kubelet/cm/cpumanager/policy_static.go | 7 +++++++
2 files changed, 15 insertions(+)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 95a4246e840..846a6d4fbed 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -21,6 +21,7 @@ import (
"fmt"
"io/ioutil"
"math"
+ "os"
"strings"
"sync"
"time"
@@ -56,6 +57,13 @@ const cpuManagerStateFileName = "cpu_manager_state"
// get the system-level isolated CPUs
func getIsolcpus() cpuset.CPUSet {
+ // This is a gross hack to basically turn off awareness of isolcpus to enable
+ // isolated cpus to be allocated to pods the same way as non-isolated CPUs.
+ if _, err := os.Stat("/etc/kubernetes/ignore_isolcpus"); err == nil {
+ klog.Infof("[cpumanager] turning off isolcpus awareness")
+ return cpuset.NewCPUSet()
+ }
+
dat, err := ioutil.ReadFile("/sys/devices/system/cpu/isolated")
if err != nil {
klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir")
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index 8d18ce65309..981e825ff57 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -18,6 +18,7 @@ package cpumanager
import (
"fmt"
+ "os"
"strconv"
v1 "k8s.io/api/core/v1"
@@ -700,6 +701,12 @@ func isKubeInfra(pod *v1.Pod) bool {
// get the isolated CPUs (if any) from the devices associated with a specific container
func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet {
+ // This is a gross hack to basically turn off awareness of isolcpus to enable
+ // isolated cpus to be allocated to pods the same way as non-isolated CPUs.
+ if _, err := os.Stat("/etc/kubernetes/ignore_isolcpus"); err == nil {
+ return cpuset.NewCPUSet()
+ }
+
// NOTE: This is required for TestStaticPolicyAdd() since makePod() does
// not create UID. We also need a way to properly stub devicemanager.
if len(string(pod.UID)) == 0 {
--
2.25.1

View File

@ -0,0 +1,108 @@
From 721ddbe3f2d5a83eaa4982dde371f9ab4cc48cf6 Mon Sep 17 00:00:00 2001
From: Chris Friesen <chris.friesen@windriver.com>
Date: Fri, 3 Sep 2021 18:05:15 -0400
Subject: [PATCH 01/10] kubeadm: create platform pods with zero CPU resources
We want to specify zero CPU resources when creating the manifests
for the static platform pods, as a workaround for the lack of
separate resource tracking for platform resources.
We also specify zero CPU resources for the coredns deployment.
manifests.go appears to be the main file for this, not sure if the
others are used but I changed them just in case.
Signed-off-by: Daniel Safta <daniel.safta@windriver.com>
---
cluster/addons/dns/coredns/coredns.yaml.base | 2 +-
cluster/addons/dns/coredns/coredns.yaml.in | 2 +-
cluster/addons/dns/coredns/coredns.yaml.sed | 2 +-
cmd/kubeadm/app/phases/addons/dns/manifests.go | 2 +-
cmd/kubeadm/app/phases/controlplane/manifests.go | 6 +++---
5 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/cluster/addons/dns/coredns/coredns.yaml.base b/cluster/addons/dns/coredns/coredns.yaml.base
index e03559423e6..bda0ff6059f 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.base
+++ b/cluster/addons/dns/coredns/coredns.yaml.base
@@ -145,7 +145,7 @@ spec:
limits:
memory: __DNS__MEMORY__LIMIT__
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cluster/addons/dns/coredns/coredns.yaml.in b/cluster/addons/dns/coredns/coredns.yaml.in
index 9b241370bea..e39d37e9e03 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.in
+++ b/cluster/addons/dns/coredns/coredns.yaml.in
@@ -145,7 +145,7 @@ spec:
limits:
memory: 'dns_memory_limit'
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cluster/addons/dns/coredns/coredns.yaml.sed b/cluster/addons/dns/coredns/coredns.yaml.sed
index 561fdf9aea8..186cce37950 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.sed
+++ b/cluster/addons/dns/coredns/coredns.yaml.sed
@@ -145,7 +145,7 @@ spec:
limits:
memory: $DNS_MEMORY_LIMIT
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cmd/kubeadm/app/phases/addons/dns/manifests.go b/cmd/kubeadm/app/phases/addons/dns/manifests.go
index 0e3c6c98c29..97c5ff96d43 100644
--- a/cmd/kubeadm/app/phases/addons/dns/manifests.go
+++ b/cmd/kubeadm/app/phases/addons/dns/manifests.go
@@ -104,7 +104,7 @@ spec:
limits:
memory: 170Mi
requests:
- cpu: 100m
+ cpu: 0
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
diff --git a/cmd/kubeadm/app/phases/controlplane/manifests.go b/cmd/kubeadm/app/phases/controlplane/manifests.go
index 73f4fa56270..da52342a6f6 100644
--- a/cmd/kubeadm/app/phases/controlplane/manifests.go
+++ b/cmd/kubeadm/app/phases/controlplane/manifests.go
@@ -63,7 +63,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", int(endpoint.BindPort), v1.URISchemeHTTPS),
ReadinessProbe: staticpodutil.ReadinessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/readyz", int(endpoint.BindPort), v1.URISchemeHTTPS),
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", int(endpoint.BindPort), v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane),
- Resources: staticpodutil.ComponentResources("250m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.GetProxyEnvVars(),
}, mounts.GetVolumes(kubeadmconstants.KubeAPIServer),
map[string]string{kubeadmconstants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey: endpoint.String()}),
@@ -75,7 +75,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeControllerManager)),
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS),
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane),
- Resources: staticpodutil.ComponentResources("200m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.GetProxyEnvVars(),
}, mounts.GetVolumes(kubeadmconstants.KubeControllerManager), nil),
kubeadmconstants.KubeScheduler: staticpodutil.ComponentPod(v1.Container{
@@ -86,7 +86,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeScheduler)),
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS),
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane),
- Resources: staticpodutil.ComponentResources("100m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.GetProxyEnvVars(),
}, mounts.GetVolumes(kubeadmconstants.KubeScheduler), nil),
}
--
2.25.1

View File

@ -0,0 +1,30 @@
From 763da9f5ced5bb40cfc314e0b8199bcf46742f14 Mon Sep 17 00:00:00 2001
From: Boovan Rajendran <boovan.rajendran@windriver.com>
Date: Wed, 30 Nov 2022 04:17:19 -0500
Subject: [PATCH 10/10] kubelet CFS quota throttling for non integer cpulimit
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
---
pkg/kubelet/cm/internal_container_lifecycle_linux.go | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
index 75406dd8564..05366ab6fcb 100644
--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go
+++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
@@ -39,7 +39,11 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain
// Disable cgroup CFS throttle at the container level.
// /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_quota_us
// /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_period_us
- if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed {
+ // We can only set CpuQuota to -1 if we're allocating the entire CPU.
+ // For fractional CPUs the CpuQuota is needed to enforce the limit.
+ cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
+ fractionalCpuQuantity := cpuQuantity.MilliValue()%1000
+ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed && fractionalCpuQuantity == 0 {
containerConfig.Linux.Resources.CpuPeriod = int64(100000)
containerConfig.Linux.Resources.CpuQuota = int64(-1)
}
--
2.25.1

View File

@ -0,0 +1,255 @@
From 403a466bb82d8c285d6a2b814e467bb949cc9ca3 Mon Sep 17 00:00:00 2001
From: Sachin Gopala Krishna <saching.krishna@windriver.com>
Date: Mon, 3 Oct 2022 19:19:48 -0400
Subject: [PATCH 05/10] kubelet cpumanager disable CFS quota throttling
This disables CFS CPU quota to avoid performance degradation due to
Linux kernel CFS quota implementation. Note that 4.18 kernel attempts
to solve the CFS throttling problem, but there are reports that it is
not completely effective.
This disables CFS quota throttling for Guaranteed pods for both
parent and container cgroups by writing -1 to cgroup cpu.cfs_quota_us.
Disabling has a dramatic latency improvement for HTTP response times.
This patch is refactored in 1.22.5 due to new internal_container_lifecycle
framework. We leverage the same mechanism to set Linux resources as:
cpu manager: specify the container CPU set during the creation
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 7 +++
pkg/kubelet/cm/cpumanager/fake_cpu_manager.go | 10 ++++-
pkg/kubelet/cm/helpers_linux.go | 10 +++++
pkg/kubelet/cm/helpers_linux_test.go | 43 ++++++++++---------
.../cm/internal_container_lifecycle_linux.go | 9 ++++
5 files changed, 57 insertions(+), 22 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 443eecd2d36..9e2dce60501 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -73,6 +73,9 @@ type Manager interface {
// State returns a read-only interface to the internal CPU manager state.
State() state.Reader
+ // GetCPUPolicy returns the assigned CPU manager policy
+ GetCPUPolicy() string
+
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
@@ -315,6 +318,10 @@ func (m *manager) State() state.Reader {
return m.state
}
+func (m *manager) GetCPUPolicy() string {
+ return m.policy.Name()
+}
+
func (m *manager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// The pod is during the admission phase. We need to save the pod to avoid it
// being cleaned before the admission ended
diff --git a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
index 93369705135..2e277da9c84 100644
--- a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
@@ -28,7 +28,8 @@ import (
)
type fakeManager struct {
- state state.State
+ policy Policy
+ state state.State
}
func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
@@ -70,6 +71,10 @@ func (m *fakeManager) State() state.Reader {
return m.state
}
+func (m *fakeManager) GetCPUPolicy() string {
+ return m.policy.Name()
+}
+
func (m *fakeManager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet {
klog.InfoS("GetExclusiveCPUs", "podUID", podUID, "containerName", containerName)
return cpuset.CPUSet{}
@@ -88,6 +93,7 @@ func (m *fakeManager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet
// NewFakeManager creates empty/fake cpu manager
func NewFakeManager() Manager {
return &fakeManager{
- state: state.NewMemoryState(),
+ policy: &nonePolicy{},
+ state: state.NewMemoryState(),
}
}
diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go
index e0292496fe9..9a22bb2d312 100644
--- a/pkg/kubelet/cm/helpers_linux.go
+++ b/pkg/kubelet/cm/helpers_linux.go
@@ -186,6 +186,16 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64,
// build the result
result := &ResourceConfig{}
if qosClass == v1.PodQOSGuaranteed {
+ // Disable CFS CPU quota to avoid performance degradation due to
+ // Linux kernel CFS throttle implementation.
+ // NOTE: 4.18 kernel attempts to solve CFS throttling problem,
+ // but there are reports that it is not completely effective.
+ // This will configure cgroup CFS parameters at pod level:
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/cpu.cfs_quota_us
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/cpu.cfs_period_us
+ cpuQuota = int64(-1)
+ cpuPeriod = uint64(100000)
+
result.CpuShares = &cpuShares
result.CpuQuota = &cpuQuota
result.CpuPeriod = &cpuPeriod
diff --git a/pkg/kubelet/cm/helpers_linux_test.go b/pkg/kubelet/cm/helpers_linux_test.go
index 9296ea29e2a..08c5a92cd4c 100644
--- a/pkg/kubelet/cm/helpers_linux_test.go
+++ b/pkg/kubelet/cm/helpers_linux_test.go
@@ -64,8 +64,9 @@ func TestResourceConfigForPod(t *testing.T) {
burstablePartialShares := MilliCPUToShares(200)
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
guaranteedShares := MilliCPUToShares(100)
- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
+ guaranteedQuotaPeriod := uint64(100000)
+ guaranteedQuota := int64(-1)
+ guaranteedTunedQuota := int64(-1)
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()
@@ -204,8 +205,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement": {
pod: &v1.Pod{
@@ -218,8 +219,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-with-tuned-quota": {
pod: &v1.Pod{
@@ -232,8 +233,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement-with-tuned-quota": {
pod: &v1.Pod{
@@ -246,8 +247,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"burstable-partial-limits-with-init-containers": {
pod: &v1.Pod{
@@ -309,8 +310,10 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
burstablePartialShares := MilliCPUToShares(200)
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
guaranteedShares := MilliCPUToShares(100)
- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
+ guaranteedQuotaPeriod := uint64(100000)
+ guaranteedQuota := int64(-1)
+ guaranteedTunedQuota := int64(-1)
+
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()
@@ -449,8 +452,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement": {
pod: &v1.Pod{
@@ -463,8 +466,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-with-tuned-quota": {
pod: &v1.Pod{
@@ -477,8 +480,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement-with-tuned-quota": {
pod: &v1.Pod{
@@ -491,8 +494,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
}
diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
index cb7c0cfa543..75406dd8564 100644
--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go
+++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
@@ -25,6 +25,7 @@ import (
"k8s.io/api/core/v1"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
+ v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
)
func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
@@ -35,6 +36,14 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain
}
}
+ // Disable cgroup CFS throttle at the container level.
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_quota_us
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_period_us
+ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed {
+ containerConfig.Linux.Resources.CpuPeriod = int64(100000)
+ containerConfig.Linux.Resources.CpuQuota = int64(-1)
+ }
+
if i.memoryManager != nil {
numaNodes := i.memoryManager.GetMemoryNUMANodes(pod, container)
if numaNodes.Len() > 0 {
--
2.25.1

View File

@ -0,0 +1,166 @@
From 6c9e22271647302c86578243de6d124ede78d829 Mon Sep 17 00:00:00 2001
From: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Date: Mon, 7 Nov 2022 09:48:01 -0500
Subject: [PATCH 07/10] kubelet cpumanager infra pods use system reserved CPUs
This assigns system infrastructure pods to the "reserved" cpuset
to isolate them from the shared pool of CPUs.
Infrastructure pods include any pods that belong to the kube-system,
armada, cert-manager, vault, platform-deployment-manager, portieris,
notification, flux-helm or metrics-server namespaces.
The implementation is a bit simplistic, it is assumed that the
"reserved" cpuset is large enough to handle all infrastructure pods
CPU allocations.
This also prevents infrastucture pods from using Guaranteed resources.
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
Signed-off-by: Gleb Aronsky <gleb.aronsky@windriver.com>
Signed-off-by: Thiago Miranda <ThiagoOliveira.Miranda@windriver.com>
Signed-off-by: Kaustubh Dhokte <kaustubh.dhokte@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
---
pkg/kubelet/cm/cpumanager/policy_static.go | 50 ++++++++++++++++---
.../cm/cpumanager/policy_static_test.go | 19 ++++++-
2 files changed, 62 insertions(+), 7 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index 4c4164a9099..180d018565c 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -56,6 +56,11 @@ func (e SMTAlignmentError) Type() string {
return ErrorSMTAlignment
}
+// Define namespaces used by platform infrastructure pods
+var infraNamespaces = [...]string{
+ "kube-system", "armada", "cert-manager", "platform-deployment-manager", "portieris", "vault", "notification", "flux-helm", "metrics-server",
+}
+
// staticPolicy is a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
@@ -128,11 +133,11 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
klog.InfoS("Static policy created with configuration", "options", opts)
policy := &staticPolicy{
- topology: topology,
- affinity: affinity,
+ topology: topology,
+ affinity: affinity,
excludeReserved: excludeReserved,
- cpusToReuse: make(map[string]cpuset.CPUSet),
- options: opts,
+ cpusToReuse: make(map[string]cpuset.CPUSet),
+ options: opts,
}
allCPUs := topology.CPUDetails.CPUs()
@@ -200,8 +205,8 @@ func (p *staticPolicy) validateState(s state.State) error {
// - user tampered with file
if !p.excludeReserved {
if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) {
- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
- p.reserved.String(), tmpDefaultCPUset.String())
+ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
+ p.reserved.String(), tmpDefaultCPUset.String())
}
}
// 2. Check if state for static policy is consistent
@@ -276,6 +281,25 @@ func (p *staticPolicy) updateCPUsToReuse(pod *v1.Pod, container *v1.Container, c
}
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
+ // Process infra pods before guaranteed pods
+ if isKubeInfra(pod) {
+ // Container belongs in reserved pool.
+ // We don't want to fall through to the p.guaranteedCPUs() clause below so return either nil or error.
+ if _, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
+ klog.Infof("[cpumanager] static policy: reserved container already present in state, skipping (namespace: %s, pod UID: %s, pod: %s, container: %s)", pod.Namespace, string(pod.UID), pod.Name, container.Name)
+ return nil
+ }
+
+ cpuset := p.reserved
+ if cpuset.IsEmpty() {
+ // If this happens then someone messed up.
+ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved)
+ }
+ s.SetCPUSet(string(pod.UID), container.Name, cpuset)
+ klog.Infof("[cpumanager] static policy: reserved: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset)
+ return nil
+ }
+
numCPUs := p.guaranteedCPUs(pod, container)
if numCPUs == 0 {
// container belongs in the shared pool (nothing to do; use default cpuset)
@@ -401,6 +425,10 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
return 0
}
+ // Infrastructure pods use reserved CPUs even if they're in the Guaranteed QoS class
+ if isKubeInfra(pod) {
+ return 0
+ }
// Safe downcast to do for all systems with < 2.1 billion CPUs.
// Per the language spec, `int` is guaranteed to be at least 32 bits wide.
// https://golang.org/ref/spec#Numeric_types
@@ -619,6 +647,16 @@ func (p *staticPolicy) generateCPUTopologyHints(availableCPUs cpuset.CPUSet, reu
return hints
}
+// check if a given pod is in a platform infrastructure namespace
+func isKubeInfra(pod *v1.Pod) bool {
+ for _, namespace := range infraNamespaces {
+ if namespace == pod.Namespace {
+ return true
+ }
+ }
+ return false
+}
+
// isHintSocketAligned function return true if numa nodes in hint are socket aligned.
func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool {
numaNodesBitMask := hint.NUMANodeAffinity.GetBits()
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index 80a0c5a9e70..414e5ce144c 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -939,7 +939,8 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
}
func TestStaticPolicyAddWithResvList(t *testing.T) {
-
+ infraPod := makePod("fakePod", "fakeContainer2", "200m", "200m")
+ infraPod.Namespace = "kube-system"
testCases := []staticPolicyTestWithResvList{
{
description: "GuPodSingleCore, SingleSocketHT, ExpectError",
@@ -981,6 +982,22 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(4, 5),
},
+ {
+ description: "InfraPod, SingleSocketHT, ExpectAllocReserved",
+ topo: topoSingleSocketHT,
+ numReservedCPUs: 2,
+ reserved: cpuset.NewCPUSet(0, 1),
+ stAssignments: state.ContainerCPUAssignments{
+ "fakePod": map[string]cpuset.CPUSet{
+ "fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
+ },
+ },
+ stDefaultCPUSet: cpuset.NewCPUSet(4, 5),
+ pod: infraPod,
+ expErr: nil,
+ expCPUAlloc: true,
+ expCSet: cpuset.NewCPUSet(0, 1),
+ },
}
testExcl := true
--
2.25.1

View File

@ -0,0 +1,744 @@
From ed1f8c6a04e7fed096eaae5081c2b5e0c3bc6fed Mon Sep 17 00:00:00 2001
From: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Date: Mon, 7 Nov 2022 13:33:03 -0500
Subject: [PATCH 08/10] kubelet cpumanager introduce concept of isolated CPUs
This introduces the concept of "isolated CPUs", which are CPUs that
have been isolated at the kernel level via the "isolcpus" kernel boot
parameter.
When starting the kubelet process, two separate sets of reserved CPUs
may be specified. With this change CPUs reserved via
'--system-reserved=cpu' will be used for infrastructure pods while the
isolated CPUs should be reserved via '--kube-reserved=cpu' to cause
kubelet to skip over them for "normal" CPU resource tracking. The
kubelet code will double-check that the specified isolated CPUs match
what the kernel exposes in "/sys/devices/system/cpu/isolated".
A plugin (outside the scope of this commit) will expose the isolated
CPUs to kubelet via the device plugin API.
If a pod specifies some number of "isolcpus" resources, the device
manager will allocate them. In this code we check whether such
resources have been allocated, and if so we set the container cpuset to
the isolated CPUs. This does mean that it really only makes sense to
specify "isolcpus" resources for best-effort or burstable pods, not for
guaranteed ones since that would throw off the accounting code. In
order to ensure the accounting still works as designed, if "isolcpus"
are specified for guaranteed pods, the affinity will be set to the
non-isolated CPUs.
This patch was refactored in 1.21.3 due to upstream API change
node: podresources: make GetDevices() consistent
(commit ad68f9588c72d6477b5a290c548a9031063ac659).
The routine podIsolCPUs() was refactored in 1.21.3 since the API
p.deviceManager.GetDevices() is returning multiple devices with
a device per cpu. The resultant cpuset needs to be the aggregate.
The routine NewStaticPolicy was refactored in 1.22.5, adding a new argument
in its signature: cpuPolicyOptions map[string]string. This change is implies
shifting the new arguments(deviceManager, excludeReserved) with one position
to the right.
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
Co-authored-by: Chris Friesen <chris.friesen@windriver.com>
Signed-off-by: Gleb Aronsky <gleb.aronsky@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
---
pkg/kubelet/cm/container_manager_linux.go | 1 +
pkg/kubelet/cm/cpumanager/cpu_manager.go | 35 ++++++-
pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 23 ++++-
pkg/kubelet/cm/cpumanager/policy_static.go | 83 ++++++++++++++--
.../cm/cpumanager/policy_static_test.go | 53 ++++++++--
pkg/kubelet/cm/devicemanager/manager_stub.go | 99 +++++++++++++++++++
6 files changed, 273 insertions(+), 21 deletions(-)
create mode 100644 pkg/kubelet/cm/devicemanager/manager_stub.go
diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go
index 332e99e72ee..5ac571d9abd 100644
--- a/pkg/kubelet/cm/container_manager_linux.go
+++ b/pkg/kubelet/cm/container_manager_linux.go
@@ -331,6 +331,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
cm.GetNodeAllocatableReservation(),
nodeConfig.KubeletRootDir,
cm.topologyManager,
+ cm.deviceManager,
)
if err != nil {
klog.ErrorS(err, "Failed to initialize cpu manager")
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index e2c89efeb2e..95a4246e840 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -19,7 +19,9 @@ package cpumanager
import (
"context"
"fmt"
+ "io/ioutil"
"math"
+ "strings"
"sync"
"time"
@@ -33,6 +35,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
@@ -51,6 +54,25 @@ type policyName string
// cpuManagerStateFileName is the file name where cpu manager stores its state
const cpuManagerStateFileName = "cpu_manager_state"
+// get the system-level isolated CPUs
+func getIsolcpus() cpuset.CPUSet {
+ dat, err := ioutil.ReadFile("/sys/devices/system/cpu/isolated")
+ if err != nil {
+ klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir")
+ return cpuset.NewCPUSet()
+ }
+
+ // The isolated cpus string ends in a newline
+ cpustring := strings.TrimSuffix(string(dat), "\n")
+ cset, err := cpuset.Parse(cpustring)
+ if err != nil {
+ klog.Errorf("[cpumanager] unable to parse sysfs isolcpus string to cpuset")
+ return cpuset.NewCPUSet()
+ }
+
+ return cset
+}
+
// Manager interface provides methods for Kubelet to manage pod cpus.
type Manager interface {
// Start is called during Kubelet initialization.
@@ -154,7 +176,8 @@ func (s *sourcesReadyStub) AddSource(source string) {}
func (s *sourcesReadyStub) AllReady() bool { return true }
// NewManager creates new cpu manager based on provided policy
-func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) {
+func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store, deviceManager devicemanager.Manager) (Manager, error) {
+
var topo *topology.CPUTopology
var policy Policy
var err error
@@ -195,7 +218,15 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc
// NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset.
// This variable is primarily to make testing easier.
excludeReserved := true
- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions, excludeReserved)
+
+ // isolCPUs is the set of kernel-isolated CPUs. They should be a subset of specificCPUs or
+ // of the CPUs that NewStaticPolicy() will pick if numReservedCPUs is set. It's only in the
+ // argument list here for ease of testing, it's really internal to the policy.
+ isolCPUs := getIsolcpus()
+ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, isolCPUs, affinity, cpuPolicyOptions, deviceManager, excludeReserved)
+ if err != nil {
+ return nil, fmt.Errorf("new static policy error: %v", err)
+ }
if err != nil {
return nil, fmt.Errorf("new static policy error: %w", err)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
index e7c74453472..78b4ada1a73 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
@@ -37,6 +37,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
@@ -215,6 +216,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string })
}
func TestCPUManagerAdd(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
@@ -230,8 +232,10 @@ func TestCPUManagerAdd(t *testing.T) {
},
0,
cpuset.NewCPUSet(),
+ cpuset.NewCPUSet(),
topologymanager.NewFakeManager(),
nil,
+ testDM,
testExcl)
testCases := []struct {
description string
@@ -482,8 +486,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
}
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testExcl)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, testExcl)
mockState := &mockState{
assignments: testCase.stAssignments,
@@ -638,7 +643,9 @@ func TestCPUManagerGenerate(t *testing.T) {
}
defer os.RemoveAll(sDir)
- mgr, err := NewManager(testCase.cpuPolicyName, nil, 5*time.Second, machineInfo, cpuset.NewCPUSet(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager())
+ testDM, err := devicemanager.NewManagerStub()
+ mgr, err := NewManager(testCase.cpuPolicyName, nil, 5*time.Second, machineInfo, cpuset.NewCPUSet(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM)
+
if testCase.expectedError != nil {
if !strings.Contains(err.Error(), testCase.expectedError.Error()) {
t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), testCase.expectedError.Error())
@@ -709,6 +716,7 @@ func TestCPUManagerRemove(t *testing.T) {
func TestReconcileState(t *testing.T) {
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 8,
@@ -727,8 +735,10 @@ func TestReconcileState(t *testing.T) {
},
0,
cpuset.NewCPUSet(),
+ cpuset.NewCPUSet(),
topologymanager.NewFakeManager(),
nil,
+ testDM,
testExcl)
testCases := []struct {
@@ -1234,6 +1244,7 @@ func TestReconcileState(t *testing.T) {
// the following tests are with --reserved-cpus configured
func TestCPUManagerAddWithResvList(t *testing.T) {
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -1248,8 +1259,10 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
},
1,
cpuset.NewCPUSet(0),
+ cpuset.NewCPUSet(),
topologymanager.NewFakeManager(),
nil,
+ testDM,
testExcl)
testCases := []struct {
description string
@@ -1362,7 +1375,8 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) {
}
defer os.RemoveAll(sDir)
- _, err = NewManager(testCase.cpuPolicyName, testCase.cpuPolicyOptions, 5*time.Second, machineInfo, cpuset.NewCPUSet(), nodeAllocatableReservation, sDir, topologymanager.NewFakeManager())
+ testDM, err := devicemanager.NewManagerStub()
+ _, err = NewManager(testCase.cpuPolicyName, testCase.cpuPolicyOptions, 5*time.Second, machineInfo, cpuset.NewCPUSet(), nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM)
if err == nil {
t.Errorf("Expected error, but NewManager succeeded")
}
@@ -1376,6 +1390,7 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) {
func TestCPUManagerGetAllocatableCPUs(t *testing.T) {
testExcl := false
+ testDm, _ := devicemanager.NewManagerStub()
nonePolicy, _ := NewNonePolicy(nil)
staticPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
@@ -1391,8 +1406,10 @@ func TestCPUManagerGetAllocatableCPUs(t *testing.T) {
},
1,
cpuset.NewCPUSet(0),
+ cpuset.NewCPUSet(),
topologymanager.NewFakeManager(),
nil,
+ testDm,
testExcl)
testCases := []struct {
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index 180d018565c..8d18ce65309 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -18,6 +18,7 @@ package cpumanager
import (
"fmt"
+ "strconv"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
@@ -26,6 +27,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"k8s.io/kubernetes/pkg/kubelet/metrics"
@@ -104,6 +106,10 @@ type staticPolicy struct {
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reserved cpuset.CPUSet
+ // subset of reserved CPUs with isolcpus attribute
+ isolcpus cpuset.CPUSet
+ // parent containerManager, used to get device list
+ deviceManager devicemanager.Manager
// If true, default CPUSet should exclude reserved CPUs
excludeReserved bool
// topology manager reference to get container Topology affinity
@@ -120,7 +126,8 @@ var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, excludeReserved bool) (Policy, error) {
+func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, isolCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, deviceManager devicemanager.Manager, excludeReserved bool) (Policy, error) {
+
opts, err := NewStaticPolicyOptions(cpuPolicyOptions)
if err != nil {
return nil, err
@@ -135,6 +142,8 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
policy := &staticPolicy{
topology: topology,
affinity: affinity,
+ isolcpus: isolCPUs,
+ deviceManager: deviceManager,
excludeReserved: excludeReserved,
cpusToReuse: make(map[string]cpuset.CPUSet),
options: opts,
@@ -161,6 +170,12 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
klog.InfoS("Reserved CPUs not available for exclusive assignment", "reservedSize", reserved.Size(), "reserved", reserved)
policy.reserved = reserved
+ if !isolCPUs.IsSubsetOf(reserved) {
+ klog.Errorf("[cpumanager] isolCPUs %v is not a subset of reserved %v", isolCPUs, reserved)
+ reserved = reserved.Union(isolCPUs)
+ klog.Warningf("[cpumanager] mismatch isolCPUs %v, force reserved %v", isolCPUs, reserved)
+ }
+
return policy, nil
}
@@ -194,8 +209,9 @@ func (p *staticPolicy) validateState(s state.State) error {
} else {
s.SetDefaultCPUSet(allCPUs)
}
- klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n",
- allCPUs, p.reserved, s.GetDefaultCPUSet())
+ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, isolcpus:%v, default:%v\n",
+ allCPUs, p.reserved, p.isolcpus, s.GetDefaultCPUSet())
+
return nil
}
@@ -290,16 +306,39 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
return nil
}
- cpuset := p.reserved
+ cpuset := p.reserved.Clone().Difference(p.isolcpus)
if cpuset.IsEmpty() {
// If this happens then someone messed up.
- return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved)
+ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v, isolcpus:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved, p.isolcpus)
+
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
klog.Infof("[cpumanager] static policy: reserved: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset)
return nil
}
+ if isolcpus := p.podIsolCPUs(pod, container); isolcpus.Size() > 0 {
+ // container has requested isolated CPUs
+ if set, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
+ if set.Equals(isolcpus) {
+ klog.Infof("[cpumanager] isolcpus container already present in state, skipping (namespace: %s, pod UID: %s, pod: %s, container: %s)",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name)
+ return nil
+ } else {
+ klog.Infof("[cpumanager] isolcpus container state has cpus %v, should be %v (namespace: %s, pod UID: %s, pod: %s, container: %s)",
+ isolcpus, set, pod.Namespace, string(pod.UID), pod.Name, container.Name)
+ }
+ }
+ // Note that we do not do anything about init containers here.
+ // It looks like devices are allocated per-pod based on effective requests/limits
+ // and extra devices from initContainers are not freed up when the regular containers start.
+ // TODO: confirm this is still true for 1.20
+ s.SetCPUSet(string(pod.UID), container.Name, isolcpus)
+ klog.Infof("[cpumanager] isolcpus: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, isolcpus)
+ return nil
+ }
+
numCPUs := p.guaranteedCPUs(pod, container)
if numCPUs == 0 {
// container belongs in the shared pool (nothing to do; use default cpuset)
@@ -348,7 +387,9 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
p.updateCPUsToReuse(pod, container, cpuset)
-
+ klog.Infof("[cpumanager] guaranteed: AddContainer "+
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); numCPUS=%d, cpuset=%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, numCPUs, cpuset)
return nil
}
@@ -657,6 +698,36 @@ func isKubeInfra(pod *v1.Pod) bool {
return false
}
+// get the isolated CPUs (if any) from the devices associated with a specific container
+func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet {
+ // NOTE: This is required for TestStaticPolicyAdd() since makePod() does
+ // not create UID. We also need a way to properly stub devicemanager.
+ if len(string(pod.UID)) == 0 {
+ return cpuset.NewCPUSet()
+ }
+ resContDevices := p.deviceManager.GetDevices(string(pod.UID), container.Name)
+ cpuSet := cpuset.NewCPUSet()
+ for resourceName, resourceDevs := range resContDevices {
+ // this resource name needs to match the isolcpus device plugin
+ if resourceName == "windriver.com/isolcpus" {
+ for devID, _ := range resourceDevs {
+ cpuStrList := []string{devID}
+ if len(cpuStrList) > 0 {
+ // loop over the list of strings, convert each one to int, add to cpuset
+ for _, cpuStr := range cpuStrList {
+ cpu, err := strconv.Atoi(cpuStr)
+ if err != nil {
+ panic(err)
+ }
+ cpuSet = cpuSet.Union(cpuset.NewCPUSet(cpu))
+ }
+ }
+ }
+ }
+ }
+ return cpuSet
+}
+
// isHintSocketAligned function return true if numa nodes in hint are socket aligned.
func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool {
numaNodesBitMask := hint.NUMANodeAffinity.GetBits()
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index 414e5ce144c..1c43df3b85f 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -28,6 +28,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)
@@ -69,8 +70,9 @@ func (spt staticPolicyTest) PseudoClone() staticPolicyTest {
}
func TestStaticPolicyName(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testExcl := false
- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testExcl)
+ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, testExcl)
policyName := policy.Name()
if policyName != "static" {
@@ -80,6 +82,7 @@ func TestStaticPolicyName(t *testing.T) {
}
func TestStaticPolicyStart(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []staticPolicyTest{
{
description: "non-corrupted state",
@@ -155,7 +158,7 @@ func TestStaticPolicyStart(t *testing.T) {
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved)
+ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, testCase.excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
@@ -203,7 +206,6 @@ func TestStaticPolicyAdd(t *testing.T) {
largeTopoCPUSet := largeTopoBuilder.Result()
largeTopoSock0CPUSet := largeTopoSock0Builder.Result()
largeTopoSock1CPUSet := largeTopoSock1Builder.Result()
-
// these are the cases which must behave the same regardless the policy options.
// So we will permutate the options to ensure this holds true.
@@ -577,8 +579,10 @@ func runStaticPolicyTestCase(t *testing.T, testCase staticPolicyTest) {
if testCase.topologyHint != nil {
tm = topologymanager.NewFakeManagerWithHint(testCase.topologyHint)
}
+ testDM, _ := devicemanager.NewManagerStub()
testExcl := false
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), tm, testCase.options, testExcl)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), tm, testCase.options, testDM, testExcl)
+
st := &mockState{
assignments: testCase.stAssignments,
@@ -625,6 +629,8 @@ func runStaticPolicyTestCaseWithFeatureGate(t *testing.T, testCase staticPolicyT
}
func TestStaticPolicyReuseCPUs(t *testing.T) {
+ excludeReserved := false
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []struct {
staticPolicyTest
expCSetAfterAlloc cpuset.CPUSet
@@ -649,7 +655,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -682,6 +688,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) {
func TestStaticPolicyRemove(t *testing.T) {
excludeReserved := false
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []staticPolicyTest{
{
description: "SingleSocketHT, DeAllocOneContainer",
@@ -740,7 +747,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, excludeReserved)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -763,6 +770,7 @@ func TestStaticPolicyRemove(t *testing.T) {
func TestTopologyAwareAllocateCPUs(t *testing.T) {
excludeReserved := false
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []struct {
description string
topo *topology.CPUTopology
@@ -831,7 +839,8 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) {
},
}
for _, tc := range testCases {
- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, excludeReserved)
+ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved)
+
policy := p.(*staticPolicy)
st := &mockState{
assignments: tc.stAssignments,
@@ -864,6 +873,7 @@ type staticPolicyTestWithResvList struct {
topo *topology.CPUTopology
numReservedCPUs int
reserved cpuset.CPUSet
+ isolcpus cpuset.CPUSet
stAssignments state.ContainerCPUAssignments
stDefaultCPUSet cpuset.CPUSet
pod *v1.Pod
@@ -874,6 +884,8 @@ type staticPolicyTestWithResvList struct {
}
func TestStaticPolicyStartWithResvList(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
+ testExcl := false
testCases := []staticPolicyTestWithResvList{
{
description: "empty cpuset",
@@ -903,11 +915,10 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"),
},
}
- testExcl := false
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl)
+ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, testExcl)
if !reflect.DeepEqual(err, testCase.expNewErr) {
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
testCase.description, testCase.expNewErr, err)
@@ -947,6 +958,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 1,
reserved: cpuset.NewCPUSet(0),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"),
@@ -959,6 +971,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"),
@@ -971,6 +984,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{
"fakePod": map[string]cpuset.CPUSet{
"fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
@@ -987,6 +1001,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(),
stAssignments: state.ContainerCPUAssignments{
"fakePod": map[string]cpuset.CPUSet{
"fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
@@ -998,11 +1013,29 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
expCPUAlloc: true,
expCSet: cpuset.NewCPUSet(0, 1),
},
+ {
+ description: "InfraPod, SingleSocketHT, Isolcpus, ExpectAllocReserved",
+ topo: topoSingleSocketHT,
+ numReservedCPUs: 2,
+ reserved: cpuset.NewCPUSet(0, 1),
+ isolcpus: cpuset.NewCPUSet(1),
+ stAssignments: state.ContainerCPUAssignments{
+ "fakePod": map[string]cpuset.CPUSet{
+ "fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7),
+ },
+ },
+ stDefaultCPUSet: cpuset.NewCPUSet(4, 5),
+ pod: infraPod,
+ expErr: nil,
+ expCPUAlloc: true,
+ expCSet: cpuset.NewCPUSet(0),
+ },
}
testExcl := true
+ testDM, _ := devicemanager.NewManagerStub()
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, testCase.isolcpus, topologymanager.NewFakeManager(), nil, testDM, testExcl)
st := &mockState{
assignments: testCase.stAssignments,
diff --git a/pkg/kubelet/cm/devicemanager/manager_stub.go b/pkg/kubelet/cm/devicemanager/manager_stub.go
new file mode 100644
index 00000000000..e6874f88d8a
--- /dev/null
+++ b/pkg/kubelet/cm/devicemanager/manager_stub.go
@@ -0,0 +1,99 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package devicemanager
+
+import (
+ v1 "k8s.io/api/core/v1"
+ "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
+ "k8s.io/kubernetes/pkg/kubelet/config"
+ "k8s.io/kubernetes/pkg/kubelet/lifecycle"
+ "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
+ schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
+)
+
+// ManagerStub provides a simple stub implementation for the Device Manager.
+type ManagerStub struct{}
+
+// NewManagerStub creates a ManagerStub.
+func NewManagerStub() (*ManagerStub, error) {
+ return &ManagerStub{}, nil
+}
+
+// Start simply returns nil.
+func (h *ManagerStub) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error {
+ return nil
+}
+
+// Stop simply returns nil.
+func (h *ManagerStub) Stop() error {
+ return nil
+}
+
+// Allocate simply returns nil.
+func (h *ManagerStub) Allocate(pod *v1.Pod, container *v1.Container) error {
+ return nil
+}
+
+// UpdatePluginResources simply returns nil.
+func (h *ManagerStub) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
+ return nil
+}
+
+// GetDeviceRunContainerOptions simply returns nil.
+func (h *ManagerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) {
+ return nil, nil
+}
+
+// GetCapacity simply returns nil capacity and empty removed resource list.
+func (h *ManagerStub) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) {
+ return nil, nil, []string{}
+}
+
+// GetWatcherHandler returns plugin watcher interface
+func (h *ManagerStub) GetWatcherHandler() cache.PluginHandler {
+ return nil
+}
+
+// GetTopologyHints returns an empty TopologyHint map
+func (h *ManagerStub) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
+ return map[string][]topologymanager.TopologyHint{}
+}
+
+// GetPodTopologyHints returns an empty TopologyHint map
+func (h *ManagerStub) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
+ return map[string][]topologymanager.TopologyHint{}
+}
+
+// GetDevices returns nil
+func (h *ManagerStub) GetDevices(_, _ string) ResourceDeviceInstances {
+ return nil
+}
+
+// GetAllocatableDevices returns nothing
+func (h *ManagerStub) GetAllocatableDevices() ResourceDeviceInstances {
+ return nil
+}
+
+// ShouldResetExtendedResourceCapacity returns false
+func (h *ManagerStub) ShouldResetExtendedResourceCapacity() bool {
+ return false
+}
+
+// UpdateAllocatedDevices returns nothing
+func (h *ManagerStub) UpdateAllocatedDevices() {
+ return
+}
--
2.25.1

View File

@ -0,0 +1,356 @@
From 7b4e8029de25b57c25b510178a41ceddf556d428 Mon Sep 17 00:00:00 2001
From: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Date: Mon, 7 Nov 2022 08:43:43 -0500
Subject: [PATCH 06/10] kubelet cpumanager keep normal containers off reserved CPUs
When starting the kubelet process, two separate sets of reserved CPUs
may be specified. With this change CPUs reserved via
'--system-reserved=cpu'
or '--kube-reserved=cpu' will be ignored by kubernetes itself. A small
tweak to the default CPU affinity ensures that "normal" Kubernetes
pods won't run on the reserved CPUs.
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 6 ++-
pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 19 +++++++---
pkg/kubelet/cm/cpumanager/policy_static.go | 30 ++++++++++++---
.../cm/cpumanager/policy_static_test.go | 38 ++++++++++++++-----
4 files changed, 71 insertions(+), 22 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 9e2dce60501..e2c89efeb2e 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -192,7 +192,11 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc
// exclusively allocated.
reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions)
+ // NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset.
+ // This variable is primarily to make testing easier.
+ excludeReserved := true
+ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions, excludeReserved)
+
if err != nil {
return nil, fmt.Errorf("new static policy error: %w", err)
}
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
index 29941611a53..e7c74453472 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
@@ -215,6 +215,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string })
}
func TestCPUManagerAdd(t *testing.T) {
+ testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -230,7 +231,8 @@ func TestCPUManagerAdd(t *testing.T) {
0,
cpuset.NewCPUSet(),
topologymanager.NewFakeManager(),
- nil)
+ nil,
+ testExcl)
testCases := []struct {
description string
updateErr error
@@ -479,8 +481,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
},
}
+ testExcl := false
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testExcl)
mockState := &mockState{
assignments: testCase.stAssignments,
@@ -705,6 +708,7 @@ func TestCPUManagerRemove(t *testing.T) {
}
func TestReconcileState(t *testing.T) {
+ testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 8,
@@ -724,7 +728,8 @@ func TestReconcileState(t *testing.T) {
0,
cpuset.NewCPUSet(),
topologymanager.NewFakeManager(),
- nil)
+ nil,
+ testExcl)
testCases := []struct {
description string
@@ -1228,6 +1233,7 @@ func TestReconcileState(t *testing.T) {
// above test cases are without kubelet --reserved-cpus cmd option
// the following tests are with --reserved-cpus configured
func TestCPUManagerAddWithResvList(t *testing.T) {
+ testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -1243,7 +1249,8 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
1,
cpuset.NewCPUSet(0),
topologymanager.NewFakeManager(),
- nil)
+ nil,
+ testExcl)
testCases := []struct {
description string
updateErr error
@@ -1368,6 +1375,7 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) {
}
func TestCPUManagerGetAllocatableCPUs(t *testing.T) {
+ testExcl := false
nonePolicy, _ := NewNonePolicy(nil)
staticPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
@@ -1384,7 +1392,8 @@ func TestCPUManagerGetAllocatableCPUs(t *testing.T) {
1,
cpuset.NewCPUSet(0),
topologymanager.NewFakeManager(),
- nil)
+ nil,
+ testExcl)
testCases := []struct {
description string
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index 4d7f7c0b0de..4c4164a9099 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -99,6 +99,8 @@ type staticPolicy struct {
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reserved cpuset.CPUSet
+ // If true, default CPUSet should exclude reserved CPUs
+ excludeReserved bool
// topology manager reference to get container Topology affinity
affinity topologymanager.Store
// set of CPUs to reuse across allocations in a pod
@@ -113,7 +115,7 @@ var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string) (Policy, error) {
+func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, excludeReserved bool) (Policy, error) {
opts, err := NewStaticPolicyOptions(cpuPolicyOptions)
if err != nil {
return nil, err
@@ -128,6 +130,7 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
policy := &staticPolicy{
topology: topology,
affinity: affinity,
+ excludeReserved: excludeReserved,
cpusToReuse: make(map[string]cpuset.CPUSet),
options: opts,
}
@@ -179,7 +182,15 @@ func (p *staticPolicy) validateState(s state.State) error {
}
// state is empty initialize
allCPUs := p.topology.CPUDetails.CPUs()
- s.SetDefaultCPUSet(allCPUs)
+ if p.excludeReserved {
+ // Exclude reserved CPUs from the default CPUSet to keep containers off them
+ // unless explicitly affined.
+ s.SetDefaultCPUSet(allCPUs.Difference(p.reserved))
+ } else {
+ s.SetDefaultCPUSet(allCPUs)
+ }
+ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n",
+ allCPUs, p.reserved, s.GetDefaultCPUSet())
return nil
}
@@ -187,11 +198,12 @@ func (p *staticPolicy) validateState(s state.State) error {
// 1. Check if the reserved cpuset is not part of default cpuset because:
// - kube/system reserved have changed (increased) - may lead to some containers not being able to start
// - user tampered with file
- if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) {
- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
- p.reserved.String(), tmpDefaultCPUset.String())
+ if !p.excludeReserved {
+ if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) {
+ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
+ p.reserved.String(), tmpDefaultCPUset.String())
+ }
}
-
// 2. Check if state for static policy is consistent
for pod := range tmpAssignments {
for container, cset := range tmpAssignments[pod] {
@@ -218,6 +230,9 @@ func (p *staticPolicy) validateState(s state.State) error {
}
}
totalKnownCPUs = totalKnownCPUs.UnionAll(tmpCPUSets)
+ if p.excludeReserved {
+ totalKnownCPUs = totalKnownCPUs.Union(p.reserved)
+ }
if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) {
return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"",
p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String())
@@ -331,6 +346,9 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa
cpusInUse := getAssignedCPUsOfSiblings(s, podUID, containerName)
if toRelease, ok := s.GetCPUSet(podUID, containerName); ok {
s.Delete(podUID, containerName)
+ if p.excludeReserved {
+ toRelease = toRelease.Difference(p.reserved)
+ }
// Mutate the shared pool, adding released cpus.
toRelease = toRelease.Difference(cpusInUse)
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index 4c10af065a4..80a0c5a9e70 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -36,6 +36,7 @@ type staticPolicyTest struct {
description string
topo *topology.CPUTopology
numReservedCPUs int
+ excludeReserved bool
podUID string
options map[string]string
containerName string
@@ -68,7 +69,8 @@ func (spt staticPolicyTest) PseudoClone() staticPolicyTest {
}
func TestStaticPolicyName(t *testing.T) {
- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil)
+ testExcl := false
+ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testExcl)
policyName := policy.Name()
if policyName != "static" {
@@ -98,6 +100,15 @@ func TestStaticPolicyStart(t *testing.T) {
stDefaultCPUSet: cpuset.NewCPUSet(),
expCSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
},
+ {
+ description: "empty cpuset exclude reserved",
+ topo: topoDualSocketHT,
+ numReservedCPUs: 2,
+ excludeReserved: true,
+ stAssignments: state.ContainerCPUAssignments{},
+ stDefaultCPUSet: cpuset.NewCPUSet(),
+ expCSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 7, 8, 9, 10, 11),
+ },
{
description: "reserved cores 0 & 6 are not present in available cpuset",
topo: topoDualSocketHT,
@@ -144,7 +155,8 @@ func TestStaticPolicyStart(t *testing.T) {
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil)
+ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved)
+
policy := p.(*staticPolicy)
st := &mockState{
assignments: testCase.stAssignments,
@@ -565,7 +577,8 @@ func runStaticPolicyTestCase(t *testing.T, testCase staticPolicyTest) {
if testCase.topologyHint != nil {
tm = topologymanager.NewFakeManagerWithHint(testCase.topologyHint)
}
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), tm, testCase.options)
+ testExcl := false
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), tm, testCase.options, testExcl)
st := &mockState{
assignments: testCase.stAssignments,
@@ -636,7 +649,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -668,6 +681,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) {
}
func TestStaticPolicyRemove(t *testing.T) {
+ excludeReserved := false
testCases := []staticPolicyTest{
{
description: "SingleSocketHT, DeAllocOneContainer",
@@ -726,7 +740,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -748,6 +762,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
func TestTopologyAwareAllocateCPUs(t *testing.T) {
+ excludeReserved := false
testCases := []struct {
description string
topo *topology.CPUTopology
@@ -816,7 +831,7 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) {
},
}
for _, tc := range testCases {
- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil)
+ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
assignments: tc.stAssignments,
@@ -888,9 +903,11 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"),
},
}
+ testExcl := false
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil)
+ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl)
+
if !reflect.DeepEqual(err, testCase.expNewErr) {
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
testCase.description, testCase.expNewErr, err)
@@ -930,7 +947,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
numReservedCPUs: 1,
reserved: cpuset.NewCPUSet(0),
stAssignments: state.ContainerCPUAssignments{},
- stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
+ stDefaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"),
expErr: fmt.Errorf("not enough cpus available to satisfy request"),
expCPUAlloc: false,
@@ -942,7 +959,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
numReservedCPUs: 2,
reserved: cpuset.NewCPUSet(0, 1),
stAssignments: state.ContainerCPUAssignments{},
- stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
+ stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"),
expErr: nil,
expCPUAlloc: true,
@@ -966,8 +983,9 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
},
}
+ testExcl := true
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl)
st := &mockState{
assignments: testCase.stAssignments,
--
2.25.1

View File

@ -0,0 +1,50 @@
From d4aa04d78e4a2692e93c1fa638dd624720a8504a Mon Sep 17 00:00:00 2001
From: Jim Gauld <James.Gauld@windriver.com>
Date: Fri, 11 Feb 2022 11:06:35 -0500
Subject: [PATCH 04/10] kubelet: sort isolcpus allocation when SMT enabled
The existing device manager code returns CPUs as devices in unsorted
order. This numerically sorts isolcpus allocations when SMT/HT is
enabled on the host. This logs SMT pairs, singletons, and algorithm
order details to make the algorithm understandable.
Signed-off-by: Jim Gauld <James.Gauld@windriver.com>
---
pkg/kubelet/cm/devicemanager/manager.go | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index 191861d9e4a..4c897f0e032 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -545,7 +545,16 @@ func order_devices_by_sibling(devices sets.String, needed int) ([]string, error)
return cpu_lst[0]
}
}
+ //Make post-analysis of selection algorithm obvious by numerical sorting
+ //the available isolated cpu_id.
+ cpu_ids := make([]int, 0, int(devices.Len()))
for cpu_id := range devices {
+ cpu_id_, _ := strconv.Atoi(cpu_id)
+ cpu_ids = append(cpu_ids, cpu_id_)
+ }
+ sort.Ints(cpu_ids)
+ for _, _cpu_id := range cpu_ids {
+ cpu_id := strconv.Itoa(_cpu_id)
// If we've already found cpu_id as a sibling, skip it.
if _, ok := _iterated_cpu[cpu_id]; ok {
continue
@@ -587,7 +596,9 @@ func order_devices_by_sibling(devices sets.String, needed int) ([]string, error)
}
}
}
- //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
+ //This algorithm will get some attention. Show minimal details.
+ klog.Infof("order_devices_by_sibling: needed=%d, smtpairs=%v, singletons=%v, order=%v",
+ needed, sibling_lst, single_lst, dev_lst)
return dev_lst, nil
}
func smt_enabled() bool {
--
2.25.1

View File

@ -0,0 +1,151 @@
From c054164c9012eb9dbe7a36775c9623fefb914996 Mon Sep 17 00:00:00 2001
From: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Date: Tue, 25 Oct 2022 11:13:35 -0400
Subject: [PATCH 03/10] kubernetes: make isolcpus allocation SMT-aware
Enhance isolcpus support in Kubernetes to allocate isolated SMT
siblings to the same container when SMT/HT is enabled on the host.
As it stands, the device manager code in Kubernetes is not SMT-aware
(since normally it doesn't deal with CPUs). However, StarlingX
exposes isolated CPUs as devices and if possible we want to allocate
all SMT siblings from a CPU core to the same container in order to
minimize cross- container interference due to resource contention
within the CPU core.
The solution is basically to take the list of isolated CPUs and
re-order it so that the SMT siblings are next to each other. That
way the existing resource selection code will allocate the siblings
together. As an optimization, if it is known that an odd number
of isolated CPUs are desired, a singleton SMT sibling will be
inserted into the list to avoid breaking up sibling pairs.
Signed-off-by: Tao Wang <tao.wang@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
---
pkg/kubelet/cm/devicemanager/manager.go | 84 ++++++++++++++++++++++++-
1 file changed, 83 insertions(+), 1 deletion(-)
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index 8cb57aa8190..191861d9e4a 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -19,10 +19,13 @@ package devicemanager
import (
"context"
"fmt"
+ "io/ioutil"
"os"
"path/filepath"
"runtime"
"sort"
+ "strconv"
+ "strings"
"sync"
"time"
@@ -36,6 +39,7 @@ import (
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
+ "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
@@ -526,6 +530,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() {
m.allocatedDevices = m.podDevices.devices()
}
+//Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed',
+//return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list
+//contain as many hyperthread sibling pairs as possible.
+func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) {
+ var dev_lst []string
+ var single_lst []string
+ sibling_lst := make([]string, 0, int(devices.Len()))
+ _iterated_cpu := make(map[string]string)
+ get_sibling := func(cpu string, cpu_lst []string) string {
+ if cpu_lst[0] == cpu {
+ return cpu_lst[1]
+ } else {
+ return cpu_lst[0]
+ }
+ }
+ for cpu_id := range devices {
+ // If we've already found cpu_id as a sibling, skip it.
+ if _, ok := _iterated_cpu[cpu_id]; ok {
+ continue
+ }
+ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id)
+ dat, err := ioutil.ReadFile(devPath)
+ if err != nil {
+ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id)
+ }
+ cpustring := strings.TrimSuffix(string(dat), "\n")
+ cpu_pair_set, err := cpuset.Parse(cpustring)
+ if err != nil {
+ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring)
+ }
+ var cpu_pair_lst []string
+ for _, v := range cpu_pair_set.ToSlice() {
+ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v))
+ }
+ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst)
+ if _, ok := devices[sibling_cpu_id]; ok {
+ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id)
+ _iterated_cpu[sibling_cpu_id] = ""
+ } else {
+ single_lst = append(single_lst, cpu_id)
+ }
+ _iterated_cpu[cpu_id] = ""
+ }
+ if needed%2 == 0 {
+ dev_lst = append(sibling_lst, single_lst...)
+ } else {
+ if len(single_lst) > 1 {
+ _tmp_list := append(sibling_lst, single_lst[1:]...)
+ dev_lst = append(single_lst[0:1], _tmp_list...)
+ } else {
+ if len(single_lst) == 0 {
+ dev_lst = sibling_lst
+ } else {
+ dev_lst = append(single_lst, sibling_lst...)
+ }
+ }
+ }
+ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
+ return dev_lst, nil
+}
+func smt_enabled() bool {
+ dat, _ := ioutil.ReadFile("/sys/devices/system/cpu/smt/active")
+ state := strings.TrimSuffix(string(dat), "\n")
+ if state == "0" {
+ return false
+ }
+ return true
+}
+
// Returns list of device Ids we need to allocate with Allocate rpc call.
// Returns empty list in case we don't need to issue the Allocate rpc call.
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
@@ -561,7 +634,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
// Create a closure to help with device allocation
// Returns 'true' once no more devices need to be allocated.
allocateRemainingFrom := func(devices sets.String) bool {
- for device := range devices.Difference(allocated) {
+ availableDevices := devices.Difference(allocated).List()
+ // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together.
+ if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() {
+ var err error
+ availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed)
+ if err != nil {
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
+ }
+ }
+ for _, device := range availableDevices {
m.allocatedDevices[resource].Insert(device)
allocated.Insert(device)
needed--
--
2.25.1

View File

@ -0,0 +1,10 @@
kubeadm-create-platform-pods-with-zero-CPU-resources.patch
Revert-use-subpath-for-coredns-only-for-default-repo.patch
kubernetes-make-isolcpus-allocation-SMT-aware.patch
kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch
kubelet-cpumanager-disable-CFS-quota-throttling.patch
kubelet-cpumanager-keep-normal-containers-off-reserv.patch
kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch
kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch
enable-support-for-kubernetes-to-ignore-isolcpus.patch
kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch