diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/Revert-use-subpath-for-coredns-only-for-default-repo.patch b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/Revert-use-subpath-for-coredns-only-for-default-repo.patch new file mode 100644 index 000000000..e0dfcb8fd --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/Revert-use-subpath-for-coredns-only-for-default-repo.patch @@ -0,0 +1,120 @@ +From c20052041d1786a78f796996aceca09de76613b7 Mon Sep 17 00:00:00 2001 +From: Ramesh Kumar Sivanandam +Date: Tue, 25 Oct 2022 19:44:44 -0400 +Subject: [PATCH 02/10] Revert "use subpath for coredns only for default + repository" + +This reverts commit 38a41e1557649a7cc763bf737779db9aa03ec75e. + +Co-authored-by: Jim Gauld +Signed-off-by: Gleb Aronsky +Signed-off-by: Ramesh Kumar Sivanandam +--- + cmd/kubeadm/app/constants/constants.go | 2 +- + cmd/kubeadm/app/images/images.go | 5 --- + cmd/kubeadm/app/images/images_test.go | 49 -------------------------- + 3 files changed, 1 insertion(+), 55 deletions(-) + +diff --git a/cmd/kubeadm/app/constants/constants.go b/cmd/kubeadm/app/constants/constants.go +index 544b5e96828..31ae3502d00 100644 +--- a/cmd/kubeadm/app/constants/constants.go ++++ b/cmd/kubeadm/app/constants/constants.go +@@ -342,7 +342,7 @@ const ( + CoreDNSDeploymentName = "coredns" + + // CoreDNSImageName specifies the name of the image for CoreDNS add-on +- CoreDNSImageName = "coredns" ++ CoreDNSImageName = "coredns/coredns" + + // CoreDNSVersion is the version of CoreDNS to be deployed if it is used + CoreDNSVersion = "v1.9.3" +diff --git a/cmd/kubeadm/app/images/images.go b/cmd/kubeadm/app/images/images.go +index 81c787b77a3..5099b260530 100644 +--- a/cmd/kubeadm/app/images/images.go ++++ b/cmd/kubeadm/app/images/images.go +@@ -22,7 +22,6 @@ import ( + "k8s.io/klog/v2" + + kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm" +- kubeadmapiv1beta2 "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/v1beta2" + 
"k8s.io/kubernetes/cmd/kubeadm/app/constants" + kubeadmutil "k8s.io/kubernetes/cmd/kubeadm/app/util" + ) +@@ -48,10 +47,6 @@ func GetDNSImage(cfg *kubeadmapi.ClusterConfiguration) string { + if cfg.DNS.ImageRepository != "" { + dnsImageRepository = cfg.DNS.ImageRepository + } +- // Handle the renaming of the official image from "registry.k8s.io/coredns" to "registry.k8s.io/coredns/coredns +- if dnsImageRepository == kubeadmapiv1beta2.DefaultImageRepository { +- dnsImageRepository = fmt.Sprintf("%s/coredns", dnsImageRepository) +- } + // DNS uses an imageTag that corresponds to the DNS version matching the Kubernetes version + dnsImageTag := constants.CoreDNSVersion + +diff --git a/cmd/kubeadm/app/images/images_test.go b/cmd/kubeadm/app/images/images_test.go +index 60927ef9493..e24d4feca23 100644 +--- a/cmd/kubeadm/app/images/images_test.go ++++ b/cmd/kubeadm/app/images/images_test.go +@@ -22,7 +22,6 @@ import ( + "testing" + + kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm" +- kubeadmapiv1beta2 "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/v1beta2" + "k8s.io/kubernetes/cmd/kubeadm/app/constants" + ) + +@@ -227,51 +226,3 @@ func TestGetAllImages(t *testing.T) { + } + } + +-func TestGetDNSImage(t *testing.T) { +- var tests = []struct { +- expected string +- cfg *kubeadmapi.ClusterConfiguration +- }{ +- { +- expected: "foo.io/coredns:v1.9.3", +- cfg: &kubeadmapi.ClusterConfiguration{ +- ImageRepository: "foo.io", +- DNS: kubeadmapi.DNS{ +- Type: kubeadmapi.CoreDNS, +- }, +- }, +- }, +- { +- expected: kubeadmapiv1beta2.DefaultImageRepository + "/coredns/coredns:v1.9.3", +- cfg: &kubeadmapi.ClusterConfiguration{ +- ImageRepository: kubeadmapiv1beta2.DefaultImageRepository, +- DNS: kubeadmapi.DNS{ +- Type: kubeadmapi.CoreDNS, +- }, +- }, +- }, +- { +- expected: "foo.io/coredns/coredns:v1.9.3", +- cfg: &kubeadmapi.ClusterConfiguration{ +- ImageRepository: "foo.io", +- DNS: kubeadmapi.DNS{ +- Type: kubeadmapi.CoreDNS, +- ImageMeta: kubeadmapi.ImageMeta{ +- 
ImageRepository: "foo.io/coredns", +- }, +- }, +- }, +- }, +- } +- +- for _, test := range tests { +- actual := GetDNSImage(test.cfg) +- if actual != test.expected { +- t.Errorf( +- "failed to GetDNSImage:\n\texpected: %s\n\t actual: %s", +- test.expected, +- actual, +- ) +- } +- } +-} +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/enable-support-for-kubernetes-to-ignore-isolcpus.patch b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/enable-support-for-kubernetes-to-ignore-isolcpus.patch new file mode 100644 index 000000000..466b2bbca --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/enable-support-for-kubernetes-to-ignore-isolcpus.patch @@ -0,0 +1,80 @@ +From 47e087ce7045aaf74cb4f3c845785a4cca101c4d Mon Sep 17 00:00:00 2001 +From: Chris Friesen +Date: Fri, 23 Oct 2020 17:46:10 -0600 +Subject: [PATCH 09/10] enable support for kubernetes to ignore isolcpus + +The normal mechanisms for allocating isolated CPUs do not allow +a mix of isolated and exclusive CPUs in the same container. In +order to allow this in *very* limited cases where the pod spec +is known in advance we will add the ability to disable the normal +isolcpus behaviour. + +If the file "/etc/kubernetes/ignore_isolcpus" exists, then kubelet +will basically forget everything it knows about isolcpus and just +treat them like regular CPUs. + +The admin user can then rely on the fact that CPU allocation is +deterministic to ensure that the isolcpus they configure end up being +allocated to the correct pods. 
+ +Signed-off-by: Daniel Safta +Signed-off-by: Ramesh Kumar Sivanandam +--- + pkg/kubelet/cm/cpumanager/cpu_manager.go | 8 ++++++++ + pkg/kubelet/cm/cpumanager/policy_static.go | 7 +++++++ + 2 files changed, 15 insertions(+) + +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index 95a4246e840..846a6d4fbed 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -21,6 +21,7 @@ import ( + "fmt" + "io/ioutil" + "math" ++ "os" + "strings" + "sync" + "time" +@@ -56,6 +57,13 @@ const cpuManagerStateFileName = "cpu_manager_state" + + // get the system-level isolated CPUs + func getIsolcpus() cpuset.CPUSet { ++ // This is a gross hack to basically turn off awareness of isolcpus to enable ++ // isolated cpus to be allocated to pods the same way as non-isolated CPUs. ++ if _, err := os.Stat("/etc/kubernetes/ignore_isolcpus"); err == nil { ++ klog.Infof("[cpumanager] turning off isolcpus awareness") ++ return cpuset.NewCPUSet() ++ } ++ + dat, err := ioutil.ReadFile("/sys/devices/system/cpu/isolated") + if err != nil { + klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir") +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index 8d18ce65309..981e825ff57 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -18,6 +18,7 @@ package cpumanager + + import ( + "fmt" ++ "os" + "strconv" + + v1 "k8s.io/api/core/v1" +@@ -700,6 +701,12 @@ func isKubeInfra(pod *v1.Pod) bool { + + // get the isolated CPUs (if any) from the devices associated with a specific container + func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet { ++ // This is a gross hack to basically turn off awareness of isolcpus to enable ++ // isolated cpus to be allocated to pods the same way as non-isolated CPUs. 
++ if _, err := os.Stat("/etc/kubernetes/ignore_isolcpus"); err == nil { ++ return cpuset.NewCPUSet() ++ } ++ + // NOTE: This is required for TestStaticPolicyAdd() since makePod() does + // not create UID. We also need a way to properly stub devicemanager. + if len(string(pod.UID)) == 0 { +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubeadm-create-platform-pods-with-zero-CPU-resources.patch b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubeadm-create-platform-pods-with-zero-CPU-resources.patch new file mode 100644 index 000000000..6c7a09164 --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubeadm-create-platform-pods-with-zero-CPU-resources.patch @@ -0,0 +1,108 @@ +From 721ddbe3f2d5a83eaa4982dde371f9ab4cc48cf6 Mon Sep 17 00:00:00 2001 +From: Chris Friesen +Date: Fri, 3 Sep 2021 18:05:15 -0400 +Subject: [PATCH 01/10] kubeadm: create platform pods with zero CPU resources + +We want to specify zero CPU resources when creating the manifests +for the static platform pods, as a workaround for the lack of +separate resource tracking for platform resources. + +We also specify zero CPU resources for the coredns deployment. +manifests.go appears to be the main file for this, not sure if the +others are used but I changed them just in case. 
+ +Signed-off-by: Daniel Safta +--- + cluster/addons/dns/coredns/coredns.yaml.base | 2 +- + cluster/addons/dns/coredns/coredns.yaml.in | 2 +- + cluster/addons/dns/coredns/coredns.yaml.sed | 2 +- + cmd/kubeadm/app/phases/addons/dns/manifests.go | 2 +- + cmd/kubeadm/app/phases/controlplane/manifests.go | 6 +++--- + 5 files changed, 7 insertions(+), 7 deletions(-) + +diff --git a/cluster/addons/dns/coredns/coredns.yaml.base b/cluster/addons/dns/coredns/coredns.yaml.base +index e03559423e6..bda0ff6059f 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.base ++++ b/cluster/addons/dns/coredns/coredns.yaml.base +@@ -145,7 +145,7 @@ spec: + limits: + memory: __DNS__MEMORY__LIMIT__ + requests: +- cpu: 100m ++ cpu: 0 + memory: 70Mi + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: +diff --git a/cluster/addons/dns/coredns/coredns.yaml.in b/cluster/addons/dns/coredns/coredns.yaml.in +index 9b241370bea..e39d37e9e03 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.in ++++ b/cluster/addons/dns/coredns/coredns.yaml.in +@@ -145,7 +145,7 @@ spec: + limits: + memory: 'dns_memory_limit' + requests: +- cpu: 100m ++ cpu: 0 + memory: 70Mi + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: +diff --git a/cluster/addons/dns/coredns/coredns.yaml.sed b/cluster/addons/dns/coredns/coredns.yaml.sed +index 561fdf9aea8..186cce37950 100644 +--- a/cluster/addons/dns/coredns/coredns.yaml.sed ++++ b/cluster/addons/dns/coredns/coredns.yaml.sed +@@ -145,7 +145,7 @@ spec: + limits: + memory: $DNS_MEMORY_LIMIT + requests: +- cpu: 100m ++ cpu: 0 + memory: 70Mi + args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: +diff --git a/cmd/kubeadm/app/phases/addons/dns/manifests.go b/cmd/kubeadm/app/phases/addons/dns/manifests.go +index 0e3c6c98c29..97c5ff96d43 100644 +--- a/cmd/kubeadm/app/phases/addons/dns/manifests.go ++++ b/cmd/kubeadm/app/phases/addons/dns/manifests.go +@@ -104,7 +104,7 @@ spec: + limits: + memory: 170Mi + requests: +- cpu: 100m ++ cpu: 0 + memory: 70Mi + 
args: [ "-conf", "/etc/coredns/Corefile" ] + volumeMounts: +diff --git a/cmd/kubeadm/app/phases/controlplane/manifests.go b/cmd/kubeadm/app/phases/controlplane/manifests.go +index 73f4fa56270..da52342a6f6 100644 +--- a/cmd/kubeadm/app/phases/controlplane/manifests.go ++++ b/cmd/kubeadm/app/phases/controlplane/manifests.go +@@ -63,7 +63,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap + LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", int(endpoint.BindPort), v1.URISchemeHTTPS), + ReadinessProbe: staticpodutil.ReadinessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/readyz", int(endpoint.BindPort), v1.URISchemeHTTPS), + StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", int(endpoint.BindPort), v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane), +- Resources: staticpodutil.ComponentResources("250m"), ++ Resources: staticpodutil.ComponentResources("0"), + Env: kubeadmutil.GetProxyEnvVars(), + }, mounts.GetVolumes(kubeadmconstants.KubeAPIServer), + map[string]string{kubeadmconstants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey: endpoint.String()}), +@@ -75,7 +75,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap + VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeControllerManager)), + LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS), + StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane), +- Resources: staticpodutil.ComponentResources("200m"), ++ Resources: staticpodutil.ComponentResources("0"), + Env: kubeadmutil.GetProxyEnvVars(), + }, 
mounts.GetVolumes(kubeadmconstants.KubeControllerManager), nil), + kubeadmconstants.KubeScheduler: staticpodutil.ComponentPod(v1.Container{ +@@ -86,7 +86,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap + VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeScheduler)), + LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS), + StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane), +- Resources: staticpodutil.ComponentResources("100m"), ++ Resources: staticpodutil.ComponentResources("0"), + Env: kubeadmutil.GetProxyEnvVars(), + }, mounts.GetVolumes(kubeadmconstants.KubeScheduler), nil), + } +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch new file mode 100644 index 000000000..a80519407 --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch @@ -0,0 +1,30 @@ +From 763da9f5ced5bb40cfc314e0b8199bcf46742f14 Mon Sep 17 00:00:00 2001 +From: Boovan Rajendran +Date: Wed, 30 Nov 2022 04:17:19 -0500 +Subject: [PATCH 10/10] kubelet CFS quota throttling for non integer cpulimit + +Signed-off-by: Boovan Rajendran +--- + pkg/kubelet/cm/internal_container_lifecycle_linux.go | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +index 75406dd8564..05366ab6fcb 100644 +--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go ++++ 
b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +@@ -39,7 +39,11 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain + // Disable cgroup CFS throttle at the container level. + // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_quota_us + // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_period_us +- if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed { ++ // We can only set CpuQuota to -1 if we're allocating the entire CPU. ++ // For fractional CPUs the CpuQuota is needed to enforce the limit. ++ cpuQuantity := container.Resources.Requests[v1.ResourceCPU] ++ fractionalCpuQuantity := cpuQuantity.MilliValue()%1000 ++ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed && fractionalCpuQuantity == 0 { + containerConfig.Linux.Resources.CpuPeriod = int64(100000) + containerConfig.Linux.Resources.CpuQuota = int64(-1) + } +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-disable-CFS-quota-throttling.patch b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-disable-CFS-quota-throttling.patch new file mode 100644 index 000000000..e2ff23c9b --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-disable-CFS-quota-throttling.patch @@ -0,0 +1,255 @@ +From 403a466bb82d8c285d6a2b814e467bb949cc9ca3 Mon Sep 17 00:00:00 2001 +From: Sachin Gopala Krishna +Date: Mon, 3 Oct 2022 19:19:48 -0400 +Subject: [PATCH 05/10] kubelet cpumanager disable CFS quota throttling + +This disables CFS CPU quota to avoid performance degradation due to +Linux kernel CFS quota implementation. Note that 4.18 kernel attempts +to solve the CFS throttling problem, but there are reports that it is +not completely effective. + +This disables CFS quota throttling for Guaranteed pods for both +parent and container cgroups by writing -1 to cgroup cpu.cfs_quota_us. 
+Disabling has a dramatic latency improvement for HTTP response times. + +This patch is refactored in 1.22.5 due to new internal_container_lifecycle +framework. We leverage the same mechanism to set Linux resources as: +cpu manager: specify the container CPU set during the creation + +Co-authored-by: Jim Gauld +Signed-off-by: Sachin Gopala Krishna +--- + pkg/kubelet/cm/cpumanager/cpu_manager.go | 7 +++ + pkg/kubelet/cm/cpumanager/fake_cpu_manager.go | 10 ++++- + pkg/kubelet/cm/helpers_linux.go | 10 +++++ + pkg/kubelet/cm/helpers_linux_test.go | 43 ++++++++++--------- + .../cm/internal_container_lifecycle_linux.go | 9 ++++ + 5 files changed, 57 insertions(+), 22 deletions(-) + +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index 443eecd2d36..9e2dce60501 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -73,6 +73,9 @@ type Manager interface { + // State returns a read-only interface to the internal CPU manager state. + State() state.Reader + ++ // GetCPUPolicy returns the assigned CPU manager policy ++ GetCPUPolicy() string ++ + // GetTopologyHints implements the topologymanager.HintProvider Interface + // and is consulted to achieve NUMA aware resource alignment among this + // and other resource controllers. +@@ -315,6 +318,10 @@ func (m *manager) State() state.Reader { + return m.state + } + ++func (m *manager) GetCPUPolicy() string { ++ return m.policy.Name() ++} ++ + func (m *manager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint { + // The pod is during the admission phase. 
We need to save the pod to avoid it + // being cleaned before the admission ended +diff --git a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go +index 93369705135..2e277da9c84 100644 +--- a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go +@@ -28,7 +28,8 @@ import ( + ) + + type fakeManager struct { +- state state.State ++ policy Policy ++ state state.State + } + + func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error { +@@ -70,6 +71,10 @@ func (m *fakeManager) State() state.Reader { + return m.state + } + ++func (m *fakeManager) GetCPUPolicy() string { ++ return m.policy.Name() ++} ++ + func (m *fakeManager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet { + klog.InfoS("GetExclusiveCPUs", "podUID", podUID, "containerName", containerName) + return cpuset.CPUSet{} +@@ -88,6 +93,7 @@ func (m *fakeManager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet + // NewFakeManager creates empty/fake cpu manager + func NewFakeManager() Manager { + return &fakeManager{ +- state: state.NewMemoryState(), ++ policy: &nonePolicy{}, ++ state: state.NewMemoryState(), + } + } +diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go +index e0292496fe9..9a22bb2d312 100644 +--- a/pkg/kubelet/cm/helpers_linux.go ++++ b/pkg/kubelet/cm/helpers_linux.go +@@ -186,6 +186,16 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, + // build the result + result := &ResourceConfig{} + if qosClass == v1.PodQOSGuaranteed { ++ // Disable CFS CPU quota to avoid performance degradation due to ++ // Linux kernel CFS throttle implementation. ++ // NOTE: 4.18 kernel attempts to solve CFS throttling problem, ++ // but there are reports that it is not completely effective. 
++ // This will configure cgroup CFS parameters at pod level: ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods//cpu.cfs_quota_us ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods//cpu.cfs_period_us ++ cpuQuota = int64(-1) ++ cpuPeriod = uint64(100000) ++ + result.CpuShares = &cpuShares + result.CpuQuota = &cpuQuota + result.CpuPeriod = &cpuPeriod +diff --git a/pkg/kubelet/cm/helpers_linux_test.go b/pkg/kubelet/cm/helpers_linux_test.go +index 9296ea29e2a..08c5a92cd4c 100644 +--- a/pkg/kubelet/cm/helpers_linux_test.go ++++ b/pkg/kubelet/cm/helpers_linux_test.go +@@ -64,8 +64,9 @@ func TestResourceConfigForPod(t *testing.T) { + burstablePartialShares := MilliCPUToShares(200) + burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod)) + guaranteedShares := MilliCPUToShares(100) +- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod)) +- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod)) ++ guaranteedQuotaPeriod := uint64(100000) ++ guaranteedQuota := int64(-1) ++ guaranteedTunedQuota := int64(-1) + memoryQuantity = resource.MustParse("100Mi") + cpuNoLimit := int64(-1) + guaranteedMemory := memoryQuantity.Value() +@@ -204,8 +205,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement": { + pod: &v1.Pod{ +@@ -218,8 +219,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: 
guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-with-tuned-quota": { + pod: &v1.Pod{ +@@ -232,8 +233,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement-with-tuned-quota": { + pod: &v1.Pod{ +@@ -246,8 +247,8 @@ func TestResourceConfigForPod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "burstable-partial-limits-with-init-containers": { + pod: &v1.Pod{ +@@ -309,8 +310,10 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + burstablePartialShares := MilliCPUToShares(200) + burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod)) + guaranteedShares := MilliCPUToShares(100) +- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod)) +- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod)) ++ guaranteedQuotaPeriod := uint64(100000) ++ guaranteedQuota := int64(-1) ++ guaranteedTunedQuota := int64(-1) ++ + memoryQuantity = resource.MustParse("100Mi") + cpuNoLimit := int64(-1) + guaranteedMemory := memoryQuantity.Value() +@@ -449,8 +452,8 @@ func 
TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement": { + pod: &v1.Pod{ +@@ -463,8 +466,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: defaultQuotaPeriod, +- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-with-tuned-quota": { + pod: &v1.Pod{ +@@ -477,8 +480,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: true, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + "guaranteed-no-cpu-enforcement-with-tuned-quota": { + pod: &v1.Pod{ +@@ -491,8 +494,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) { + }, + }, + enforceCPULimits: false, +- quotaPeriod: tunedQuotaPeriod, +- expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory}, ++ quotaPeriod: 
guaranteedQuotaPeriod, ++ expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory}, + }, + } + +diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +index cb7c0cfa543..75406dd8564 100644 +--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go ++++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go +@@ -25,6 +25,7 @@ import ( + + "k8s.io/api/core/v1" + runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" ++ v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" + ) + + func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error { +@@ -35,6 +36,14 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain + } + } + ++ // Disable cgroup CFS throttle at the container level. ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_quota_us ++ // /sys/fs/cgroup/cpu/k8s-infra/kubepods///cpu.cfs_period_us ++ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed { ++ containerConfig.Linux.Resources.CpuPeriod = int64(100000) ++ containerConfig.Linux.Resources.CpuQuota = int64(-1) ++ } ++ + if i.memoryManager != nil { + numaNodes := i.memoryManager.GetMemoryNUMANodes(pod, container) + if numaNodes.Len() > 0 { +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch new file mode 100644 index 000000000..2f60d6604 --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch @@ -0,0 +1,166 @@ +From 6c9e22271647302c86578243de6d124ede78d829 Mon Sep 17 00:00:00 2001 +From: Ramesh Kumar Sivanandam +Date: Mon, 7 Nov 
2022 09:48:01 -0500 +Subject: [PATCH 07/10] kubelet cpumanager infra pods use system reserved CPUs + +This assigns system infrastructure pods to the "reserved" cpuset +to isolate them from the shared pool of CPUs. + +Infrastructure pods include any pods that belong to the kube-system, +armada, cert-manager, vault, platform-deployment-manager, portieris, +notification, flux-helm or metrics-server namespaces. + +The implementation is a bit simplistic, it is assumed that the +"reserved" cpuset is large enough to handle all infrastructure pods +CPU allocations. + +This also prevents infrastructure pods from using Guaranteed resources. + +Co-authored-by: Jim Gauld +Signed-off-by: Gleb Aronsky +Signed-off-by: Thiago Miranda +Signed-off-by: Kaustubh Dhokte +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Sachin Gopala Krishna +--- + pkg/kubelet/cm/cpumanager/policy_static.go | 50 ++++++++++++++++--- + .../cm/cpumanager/policy_static_test.go | 19 ++++++- + 2 files changed, 62 insertions(+), 7 deletions(-) + +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index 4c4164a9099..180d018565c 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -56,6 +56,11 @@ func (e SMTAlignmentError) Type() string { + return ErrorSMTAlignment + } + ++// Define namespaces used by platform infrastructure pods ++var infraNamespaces = [...]string{ ++ "kube-system", "armada", "cert-manager", "platform-deployment-manager", "portieris", "vault", "notification", "flux-helm", "metrics-server", ++} ++ + // staticPolicy is a CPU manager policy that does not change CPU + // assignments for exclusively pinned guaranteed containers after the main + // container process starts. 
+@@ -128,11 +133,11 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + klog.InfoS("Static policy created with configuration", "options", opts) + + policy := &staticPolicy{ +- topology: topology, +- affinity: affinity, ++ topology: topology, ++ affinity: affinity, + excludeReserved: excludeReserved, +- cpusToReuse: make(map[string]cpuset.CPUSet), +- options: opts, ++ cpusToReuse: make(map[string]cpuset.CPUSet), ++ options: opts, + } + + allCPUs := topology.CPUDetails.CPUs() +@@ -200,8 +205,8 @@ func (p *staticPolicy) validateState(s state.State) error { + // - user tampered with file + if !p.excludeReserved { + if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) { +- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", +- p.reserved.String(), tmpDefaultCPUset.String()) ++ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", ++ p.reserved.String(), tmpDefaultCPUset.String()) + } + } + // 2. Check if state for static policy is consistent +@@ -276,6 +281,25 @@ func (p *staticPolicy) updateCPUsToReuse(pod *v1.Pod, container *v1.Container, c + } + + func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) { ++ // Process infra pods before guaranteed pods ++ if isKubeInfra(pod) { ++ // Container belongs in reserved pool. ++ // We don't want to fall through to the p.guaranteedCPUs() clause below so return either nil or error. ++ if _, ok := s.GetCPUSet(string(pod.UID), container.Name); ok { ++ klog.Infof("[cpumanager] static policy: reserved container already present in state, skipping (namespace: %s, pod UID: %s, pod: %s, container: %s)", pod.Namespace, string(pod.UID), pod.Name, container.Name) ++ return nil ++ } ++ ++ cpuset := p.reserved ++ if cpuset.IsEmpty() { ++ // If this happens then someone messed up. 
++ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved) ++ } ++ s.SetCPUSet(string(pod.UID), container.Name, cpuset) ++ klog.Infof("[cpumanager] static policy: reserved: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset) ++ return nil ++ } ++ + numCPUs := p.guaranteedCPUs(pod, container) + if numCPUs == 0 { + // container belongs in the shared pool (nothing to do; use default cpuset) +@@ -401,6 +425,10 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int + if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() { + return 0 + } ++ // Infrastructure pods use reserved CPUs even if they're in the Guaranteed QoS class ++ if isKubeInfra(pod) { ++ return 0 ++ } + // Safe downcast to do for all systems with < 2.1 billion CPUs. + // Per the language spec, `int` is guaranteed to be at least 32 bits wide. + // https://golang.org/ref/spec#Numeric_types +@@ -619,6 +647,16 @@ func (p *staticPolicy) generateCPUTopologyHints(availableCPUs cpuset.CPUSet, reu + return hints + } + ++// check if a given pod is in a platform infrastructure namespace ++func isKubeInfra(pod *v1.Pod) bool { ++ for _, namespace := range infraNamespaces { ++ if namespace == pod.Namespace { ++ return true ++ } ++ } ++ return false ++} ++ + // isHintSocketAligned function return true if numa nodes in hint are socket aligned. 
+ func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool { + numaNodesBitMask := hint.NUMANodeAffinity.GetBits() +diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go +index 80a0c5a9e70..414e5ce144c 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go +@@ -939,7 +939,8 @@ func TestStaticPolicyStartWithResvList(t *testing.T) { + } + + func TestStaticPolicyAddWithResvList(t *testing.T) { +- ++ infraPod := makePod("fakePod", "fakeContainer2", "200m", "200m") ++ infraPod.Namespace = "kube-system" + testCases := []staticPolicyTestWithResvList{ + { + description: "GuPodSingleCore, SingleSocketHT, ExpectError", +@@ -981,6 +982,22 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + expCPUAlloc: true, + expCSet: cpuset.NewCPUSet(4, 5), + }, ++ { ++ description: "InfraPod, SingleSocketHT, ExpectAllocReserved", ++ topo: topoSingleSocketHT, ++ numReservedCPUs: 2, ++ reserved: cpuset.NewCPUSet(0, 1), ++ stAssignments: state.ContainerCPUAssignments{ ++ "fakePod": map[string]cpuset.CPUSet{ ++ "fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7), ++ }, ++ }, ++ stDefaultCPUSet: cpuset.NewCPUSet(4, 5), ++ pod: infraPod, ++ expErr: nil, ++ expCPUAlloc: true, ++ expCSet: cpuset.NewCPUSet(0, 1), ++ }, + } + + testExcl := true +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch new file mode 100644 index 000000000..6485b4ab3 --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch @@ -0,0 +1,744 @@ +From ed1f8c6a04e7fed096eaae5081c2b5e0c3bc6fed Mon Sep 17 00:00:00 2001 +From: Ramesh Kumar Sivanandam +Date: Mon, 7 Nov 2022 13:33:03 -0500 
+Subject: [PATCH 08/10] kubelet cpumanager introduce concept of isolated CPUs + +This introduces the concept of "isolated CPUs", which are CPUs that +have been isolated at the kernel level via the "isolcpus" kernel boot +parameter. + +When starting the kubelet process, two separate sets of reserved CPUs +may be specified. With this change CPUs reserved via +'--system-reserved=cpu' will be used for infrastructure pods while the +isolated CPUs should be reserved via '--kube-reserved=cpu' to cause +kubelet to skip over them for "normal" CPU resource tracking. The +kubelet code will double-check that the specified isolated CPUs match +what the kernel exposes in "/sys/devices/system/cpu/isolated". + +A plugin (outside the scope of this commit) will expose the isolated +CPUs to kubelet via the device plugin API. + +If a pod specifies some number of "isolcpus" resources, the device +manager will allocate them. In this code we check whether such +resources have been allocated, and if so we set the container cpuset to +the isolated CPUs. This does mean that it really only makes sense to +specify "isolcpus" resources for best-effort or burstable pods, not for +guaranteed ones since that would throw off the accounting code. In +order to ensure the accounting still works as designed, if "isolcpus" +are specified for guaranteed pods, the affinity will be set to the +non-isolated CPUs. + +This patch was refactored in 1.21.3 due to upstream API change +node: podresources: make GetDevices() consistent +(commit ad68f9588c72d6477b5a290c548a9031063ac659). + +The routine podIsolCPUs() was refactored in 1.21.3 since the API +p.deviceManager.GetDevices() is returning multiple devices with +a device per cpu. The resultant cpuset needs to be the aggregate. + +The routine NewStaticPolicy was refactored in 1.22.5, adding a new argument +in its signature: cpuPolicyOptions map[string]string. 
This change implies +shifting the new arguments (deviceManager, excludeReserved) by one position +to the right. + +Co-authored-by: Jim Gauld +Co-authored-by: Chris Friesen +Signed-off-by: Gleb Aronsky +Signed-off-by: Ramesh Kumar Sivanandam +Signed-off-by: Sachin Gopala Krishna +--- + pkg/kubelet/cm/container_manager_linux.go | 1 + + pkg/kubelet/cm/cpumanager/cpu_manager.go | 35 ++++++- + pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 23 ++++- + pkg/kubelet/cm/cpumanager/policy_static.go | 83 ++++++++++++++-- + .../cm/cpumanager/policy_static_test.go | 53 ++++++++-- + pkg/kubelet/cm/devicemanager/manager_stub.go | 99 +++++++++++++++++++ + 6 files changed, 273 insertions(+), 21 deletions(-) + create mode 100644 pkg/kubelet/cm/devicemanager/manager_stub.go + +diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go +index 332e99e72ee..5ac571d9abd 100644 +--- a/pkg/kubelet/cm/container_manager_linux.go ++++ b/pkg/kubelet/cm/container_manager_linux.go +@@ -331,6 +331,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I + cm.GetNodeAllocatableReservation(), + nodeConfig.KubeletRootDir, + cm.topologyManager, ++ cm.deviceManager, + ) + if err != nil { + klog.ErrorS(err, "Failed to initialize cpu manager") +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index e2c89efeb2e..95a4246e840 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -19,7 +19,9 @@ package cpumanager + import ( + "context" + "fmt" ++ "io/ioutil" + "math" ++ "strings" + "sync" + "time" + +@@ -33,6 +35,7 @@ import ( + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" + "k8s.io/kubernetes/pkg/kubelet/cm/cpuset" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/config" + 
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" +@@ -51,6 +54,25 @@ type policyName string + // cpuManagerStateFileName is the file name where cpu manager stores its state + const cpuManagerStateFileName = "cpu_manager_state" + ++// get the system-level isolated CPUs ++func getIsolcpus() cpuset.CPUSet { ++ dat, err := ioutil.ReadFile("/sys/devices/system/cpu/isolated") ++ if err != nil { ++ klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir") ++ return cpuset.NewCPUSet() ++ } ++ ++ // The isolated cpus string ends in a newline ++ cpustring := strings.TrimSuffix(string(dat), "\n") ++ cset, err := cpuset.Parse(cpustring) ++ if err != nil { ++ klog.Errorf("[cpumanager] unable to parse sysfs isolcpus string to cpuset") ++ return cpuset.NewCPUSet() ++ } ++ ++ return cset ++} ++ + // Manager interface provides methods for Kubelet to manage pod cpus. + type Manager interface { + // Start is called during Kubelet initialization. +@@ -154,7 +176,8 @@ func (s *sourcesReadyStub) AddSource(source string) {} + func (s *sourcesReadyStub) AllReady() bool { return true } + + // NewManager creates new cpu manager based on provided policy +-func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) { ++func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store, deviceManager devicemanager.Manager) (Manager, error) { ++ + var topo *topology.CPUTopology + var policy Policy + var err error +@@ -195,7 +218,15 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc + // NOTE: Set excludeReserved 
unconditionally to exclude reserved CPUs from default cpuset. + // This variable is primarily to make testing easier. + excludeReserved := true +- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions, excludeReserved) ++ ++ // isolCPUs is the set of kernel-isolated CPUs. They should be a subset of specificCPUs or ++ // of the CPUs that NewStaticPolicy() will pick if numReservedCPUs is set. It's only in the ++ // argument list here for ease of testing, it's really internal to the policy. ++ isolCPUs := getIsolcpus() ++ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, isolCPUs, affinity, cpuPolicyOptions, deviceManager, excludeReserved) ++ if err != nil { ++ return nil, fmt.Errorf("new static policy error: %v", err) ++ } + + if err != nil { + return nil, fmt.Errorf("new static policy error: %w", err) +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +index e7c74453472..78b4ada1a73 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +@@ -37,6 +37,7 @@ import ( + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" + "k8s.io/kubernetes/pkg/kubelet/cm/cpuset" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + ) + +@@ -215,6 +216,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string }) + } + + func TestCPUManagerAdd(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() + testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ +@@ -230,8 +232,10 @@ func TestCPUManagerAdd(t *testing.T) { + }, + 0, + cpuset.NewCPUSet(), ++ cpuset.NewCPUSet(), + topologymanager.NewFakeManager(), + nil, ++ testDM, + testExcl) + testCases := []struct { + description string +@@ -482,8 +486,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) { 
+ } + + testExcl := false ++ testDM, _ := devicemanager.NewManagerStub() + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testExcl) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, testExcl) + + mockState := &mockState{ + assignments: testCase.stAssignments, +@@ -638,7 +643,9 @@ func TestCPUManagerGenerate(t *testing.T) { + } + defer os.RemoveAll(sDir) + +- mgr, err := NewManager(testCase.cpuPolicyName, nil, 5*time.Second, machineInfo, cpuset.NewCPUSet(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager()) ++ testDM, err := devicemanager.NewManagerStub() ++ mgr, err := NewManager(testCase.cpuPolicyName, nil, 5*time.Second, machineInfo, cpuset.NewCPUSet(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM) ++ + if testCase.expectedError != nil { + if !strings.Contains(err.Error(), testCase.expectedError.Error()) { + t.Errorf("Unexpected error message. 
Have: %s wants %s", err.Error(), testCase.expectedError.Error()) +@@ -709,6 +716,7 @@ func TestCPUManagerRemove(t *testing.T) { + + func TestReconcileState(t *testing.T) { + testExcl := false ++ testDM, _ := devicemanager.NewManagerStub() + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 8, +@@ -727,8 +735,10 @@ func TestReconcileState(t *testing.T) { + }, + 0, + cpuset.NewCPUSet(), ++ cpuset.NewCPUSet(), + topologymanager.NewFakeManager(), + nil, ++ testDM, + testExcl) + + testCases := []struct { +@@ -1234,6 +1244,7 @@ func TestReconcileState(t *testing.T) { + // the following tests are with --reserved-cpus configured + func TestCPUManagerAddWithResvList(t *testing.T) { + testExcl := false ++ testDM, _ := devicemanager.NewManagerStub() + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 4, +@@ -1248,8 +1259,10 @@ func TestCPUManagerAddWithResvList(t *testing.T) { + }, + 1, + cpuset.NewCPUSet(0), ++ cpuset.NewCPUSet(), + topologymanager.NewFakeManager(), + nil, ++ testDM, + testExcl) + testCases := []struct { + description string +@@ -1362,7 +1375,8 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) { + } + defer os.RemoveAll(sDir) + +- _, err = NewManager(testCase.cpuPolicyName, testCase.cpuPolicyOptions, 5*time.Second, machineInfo, cpuset.NewCPUSet(), nodeAllocatableReservation, sDir, topologymanager.NewFakeManager()) ++ testDM, err := devicemanager.NewManagerStub() ++ _, err = NewManager(testCase.cpuPolicyName, testCase.cpuPolicyOptions, 5*time.Second, machineInfo, cpuset.NewCPUSet(), nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM) + if err == nil { + t.Errorf("Expected error, but NewManager succeeded") + } +@@ -1376,6 +1390,7 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) { + + func TestCPUManagerGetAllocatableCPUs(t *testing.T) { + testExcl := false ++ testDm, _ := devicemanager.NewManagerStub() + nonePolicy, _ := NewNonePolicy(nil) + staticPolicy, _ := NewStaticPolicy( + 
&topology.CPUTopology{ +@@ -1391,8 +1406,10 @@ func TestCPUManagerGetAllocatableCPUs(t *testing.T) { + }, + 1, + cpuset.NewCPUSet(0), ++ cpuset.NewCPUSet(), + topologymanager.NewFakeManager(), + nil, ++ testDm, + testExcl) + + testCases := []struct { +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index 180d018565c..8d18ce65309 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -18,6 +18,7 @@ package cpumanager + + import ( + "fmt" ++ "strconv" + + v1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" +@@ -26,6 +27,7 @@ import ( + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" + "k8s.io/kubernetes/pkg/kubelet/cm/cpuset" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" + "k8s.io/kubernetes/pkg/kubelet/metrics" +@@ -104,6 +106,10 @@ type staticPolicy struct { + topology *topology.CPUTopology + // set of CPUs that is not available for exclusive assignment + reserved cpuset.CPUSet ++ // subset of reserved CPUs with isolcpus attribute ++ isolcpus cpuset.CPUSet ++ // parent containerManager, used to get device list ++ deviceManager devicemanager.Manager + // If true, default CPUSet should exclude reserved CPUs + excludeReserved bool + // topology manager reference to get container Topology affinity +@@ -120,7 +126,8 @@ var _ Policy = &staticPolicy{} + // NewStaticPolicy returns a CPU manager policy that does not change CPU + // assignments for exclusively pinned guaranteed containers after the main + // container process starts. 
+-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, excludeReserved bool) (Policy, error) { ++func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, isolCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, deviceManager devicemanager.Manager, excludeReserved bool) (Policy, error) { ++ + opts, err := NewStaticPolicyOptions(cpuPolicyOptions) + if err != nil { + return nil, err +@@ -135,6 +142,8 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + policy := &staticPolicy{ + topology: topology, + affinity: affinity, ++ isolcpus: isolCPUs, ++ deviceManager: deviceManager, + excludeReserved: excludeReserved, + cpusToReuse: make(map[string]cpuset.CPUSet), + options: opts, +@@ -161,6 +170,12 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + klog.InfoS("Reserved CPUs not available for exclusive assignment", "reservedSize", reserved.Size(), "reserved", reserved) + policy.reserved = reserved + ++ if !isolCPUs.IsSubsetOf(reserved) { ++ klog.Errorf("[cpumanager] isolCPUs %v is not a subset of reserved %v", isolCPUs, reserved) ++ reserved = reserved.Union(isolCPUs) ++ klog.Warningf("[cpumanager] mismatch isolCPUs %v, force reserved %v", isolCPUs, reserved) ++ } ++ + return policy, nil + } + +@@ -194,8 +209,9 @@ func (p *staticPolicy) validateState(s state.State) error { + } else { + s.SetDefaultCPUSet(allCPUs) + } +- klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n", +- allCPUs, p.reserved, s.GetDefaultCPUSet()) ++ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, isolcpus:%v, default:%v\n", ++ allCPUs, p.reserved, p.isolcpus, s.GetDefaultCPUSet()) ++ + return nil + } + +@@ -290,16 +306,39 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, 
container *v1.Contai + return nil + } + +- cpuset := p.reserved ++ cpuset := p.reserved.Clone().Difference(p.isolcpus) + if cpuset.IsEmpty() { + // If this happens then someone messed up. +- return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved) ++ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v, isolcpus:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reserved, p.isolcpus) ++ + } + s.SetCPUSet(string(pod.UID), container.Name, cpuset) + klog.Infof("[cpumanager] static policy: reserved: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset) + return nil + } + ++ if isolcpus := p.podIsolCPUs(pod, container); isolcpus.Size() > 0 { ++ // container has requested isolated CPUs ++ if set, ok := s.GetCPUSet(string(pod.UID), container.Name); ok { ++ if set.Equals(isolcpus) { ++ klog.Infof("[cpumanager] isolcpus container already present in state, skipping (namespace: %s, pod UID: %s, pod: %s, container: %s)", ++ pod.Namespace, string(pod.UID), pod.Name, container.Name) ++ return nil ++ } else { ++ klog.Infof("[cpumanager] isolcpus container state has cpus %v, should be %v (namespace: %s, pod UID: %s, pod: %s, container: %s)", ++ isolcpus, set, pod.Namespace, string(pod.UID), pod.Name, container.Name) ++ } ++ } ++ // Note that we do not do anything about init containers here. ++ // It looks like devices are allocated per-pod based on effective requests/limits ++ // and extra devices from initContainers are not freed up when the regular containers start. 
++ // TODO: confirm this is still true for 1.20 ++ s.SetCPUSet(string(pod.UID), container.Name, isolcpus) ++ klog.Infof("[cpumanager] isolcpus: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", ++ pod.Namespace, string(pod.UID), pod.Name, container.Name, isolcpus) ++ return nil ++ } ++ + numCPUs := p.guaranteedCPUs(pod, container) + if numCPUs == 0 { + // container belongs in the shared pool (nothing to do; use default cpuset) +@@ -348,7 +387,9 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai + } + s.SetCPUSet(string(pod.UID), container.Name, cpuset) + p.updateCPUsToReuse(pod, container, cpuset) +- ++ klog.Infof("[cpumanager] guaranteed: AddContainer "+ ++ "(namespace: %s, pod UID: %s, pod: %s, container: %s); numCPUS=%d, cpuset=%v", ++ pod.Namespace, string(pod.UID), pod.Name, container.Name, numCPUs, cpuset) + return nil + } + +@@ -657,6 +698,36 @@ func isKubeInfra(pod *v1.Pod) bool { + return false + } + ++// get the isolated CPUs (if any) from the devices associated with a specific container ++func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet { ++ // NOTE: This is required for TestStaticPolicyAdd() since makePod() does ++ // not create UID. We also need a way to properly stub devicemanager. 
++ if len(string(pod.UID)) == 0 { ++ return cpuset.NewCPUSet() ++ } ++ resContDevices := p.deviceManager.GetDevices(string(pod.UID), container.Name) ++ cpuSet := cpuset.NewCPUSet() ++ for resourceName, resourceDevs := range resContDevices { ++ // this resource name needs to match the isolcpus device plugin ++ if resourceName == "windriver.com/isolcpus" { ++ for devID, _ := range resourceDevs { ++ cpuStrList := []string{devID} ++ if len(cpuStrList) > 0 { ++ // loop over the list of strings, convert each one to int, add to cpuset ++ for _, cpuStr := range cpuStrList { ++ cpu, err := strconv.Atoi(cpuStr) ++ if err != nil { ++ panic(err) ++ } ++ cpuSet = cpuSet.Union(cpuset.NewCPUSet(cpu)) ++ } ++ } ++ } ++ } ++ } ++ return cpuSet ++} ++ + // isHintSocketAligned function return true if numa nodes in hint are socket aligned. + func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool { + numaNodesBitMask := hint.NUMANodeAffinity.GetBits() +diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go +index 414e5ce144c..1c43df3b85f 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go +@@ -28,6 +28,7 @@ import ( + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state" + "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" + "k8s.io/kubernetes/pkg/kubelet/cm/cpuset" ++ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" + ) +@@ -69,8 +70,9 @@ func (spt staticPolicyTest) PseudoClone() staticPolicyTest { + } + + func TestStaticPolicyName(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() + testExcl := false +- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testExcl) ++ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, 
cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, testExcl) + + policyName := policy.Name() + if policyName != "static" { +@@ -80,6 +82,7 @@ func TestStaticPolicyName(t *testing.T) { + } + + func TestStaticPolicyStart(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []staticPolicyTest{ + { + description: "non-corrupted state", +@@ -155,7 +158,7 @@ func TestStaticPolicyStart(t *testing.T) { + } + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) ++ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, testCase.excludeReserved) + + policy := p.(*staticPolicy) + st := &mockState{ +@@ -203,7 +206,6 @@ func TestStaticPolicyAdd(t *testing.T) { + largeTopoCPUSet := largeTopoBuilder.Result() + largeTopoSock0CPUSet := largeTopoSock0Builder.Result() + largeTopoSock1CPUSet := largeTopoSock1Builder.Result() +- + // these are the cases which must behave the same regardless the policy options. + // So we will permutate the options to ensure this holds true. 
+ +@@ -577,8 +579,10 @@ func runStaticPolicyTestCase(t *testing.T, testCase staticPolicyTest) { + if testCase.topologyHint != nil { + tm = topologymanager.NewFakeManagerWithHint(testCase.topologyHint) + } ++ testDM, _ := devicemanager.NewManagerStub() + testExcl := false +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), tm, testCase.options, testExcl) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), tm, testCase.options, testDM, testExcl) ++ + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -625,6 +629,8 @@ func runStaticPolicyTestCaseWithFeatureGate(t *testing.T, testCase staticPolicyT + } + + func TestStaticPolicyReuseCPUs(t *testing.T) { ++ excludeReserved := false ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []struct { + staticPolicyTest + expCSetAfterAlloc cpuset.CPUSet +@@ -649,7 +655,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -682,6 +688,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + + func TestStaticPolicyRemove(t *testing.T) { + excludeReserved := false ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []staticPolicyTest{ + { + description: "SingleSocketHT, DeAllocOneContainer", +@@ -740,7 +747,7 @@ func TestStaticPolicyRemove(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, excludeReserved) ++ policy, _ := 
NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -763,6 +770,7 @@ func TestStaticPolicyRemove(t *testing.T) { + + func TestTopologyAwareAllocateCPUs(t *testing.T) { + excludeReserved := false ++ testDM, _ := devicemanager.NewManagerStub() + testCases := []struct { + description string + topo *topology.CPUTopology +@@ -831,7 +839,8 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) { + }, + } + for _, tc := range testCases { +- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, excludeReserved) ++ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved) ++ + policy := p.(*staticPolicy) + st := &mockState{ + assignments: tc.stAssignments, +@@ -864,6 +873,7 @@ type staticPolicyTestWithResvList struct { + topo *topology.CPUTopology + numReservedCPUs int + reserved cpuset.CPUSet ++ isolcpus cpuset.CPUSet + stAssignments state.ContainerCPUAssignments + stDefaultCPUSet cpuset.CPUSet + pod *v1.Pod +@@ -874,6 +884,8 @@ type staticPolicyTestWithResvList struct { + } + + func TestStaticPolicyStartWithResvList(t *testing.T) { ++ testDM, _ := devicemanager.NewManagerStub() ++ testExcl := false + testCases := []staticPolicyTestWithResvList{ + { + description: "empty cpuset", +@@ -903,11 +915,10 @@ func TestStaticPolicyStartWithResvList(t *testing.T) { + expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"), + }, + } +- testExcl := false + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) + ++ p, err := NewStaticPolicy(testCase.topo, 
testCase.numReservedCPUs, testCase.reserved, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testDM, testExcl) + if !reflect.DeepEqual(err, testCase.expNewErr) { + t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v", + testCase.description, testCase.expNewErr, err) +@@ -947,6 +958,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 1, + reserved: cpuset.NewCPUSet(0), ++ isolcpus: cpuset.NewCPUSet(), + stAssignments: state.ContainerCPUAssignments{}, + stDefaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"), +@@ -959,6 +971,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.NewCPUSet(0, 1), ++ isolcpus: cpuset.NewCPUSet(), + stAssignments: state.ContainerCPUAssignments{}, + stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"), +@@ -971,6 +984,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.NewCPUSet(0, 1), ++ isolcpus: cpuset.NewCPUSet(), + stAssignments: state.ContainerCPUAssignments{ + "fakePod": map[string]cpuset.CPUSet{ + "fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7), +@@ -987,6 +1001,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + topo: topoSingleSocketHT, + numReservedCPUs: 2, + reserved: cpuset.NewCPUSet(0, 1), ++ isolcpus: cpuset.NewCPUSet(), + stAssignments: state.ContainerCPUAssignments{ + "fakePod": map[string]cpuset.CPUSet{ + "fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7), +@@ -998,11 +1013,29 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + expCPUAlloc: true, + expCSet: cpuset.NewCPUSet(0, 1), + }, ++ { ++ description: "InfraPod, SingleSocketHT, Isolcpus, ExpectAllocReserved", ++ topo: topoSingleSocketHT, ++ numReservedCPUs: 2, ++ reserved: cpuset.NewCPUSet(0, 1), ++ 
isolcpus: cpuset.NewCPUSet(1), ++ stAssignments: state.ContainerCPUAssignments{ ++ "fakePod": map[string]cpuset.CPUSet{ ++ "fakeContainer100": cpuset.NewCPUSet(2, 3, 6, 7), ++ }, ++ }, ++ stDefaultCPUSet: cpuset.NewCPUSet(4, 5), ++ pod: infraPod, ++ expErr: nil, ++ expCPUAlloc: true, ++ expCSet: cpuset.NewCPUSet(0), ++ }, + } + + testExcl := true ++ testDM, _ := devicemanager.NewManagerStub() + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, testCase.isolcpus, topologymanager.NewFakeManager(), nil, testDM, testExcl) + + st := &mockState{ + assignments: testCase.stAssignments, +diff --git a/pkg/kubelet/cm/devicemanager/manager_stub.go b/pkg/kubelet/cm/devicemanager/manager_stub.go +new file mode 100644 +index 00000000000..e6874f88d8a +--- /dev/null ++++ b/pkg/kubelet/cm/devicemanager/manager_stub.go +@@ -0,0 +1,99 @@ ++/* ++Copyright 2017 The Kubernetes Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. 
++*/ ++ ++package devicemanager ++ ++import ( ++ v1 "k8s.io/api/core/v1" ++ "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" ++ "k8s.io/kubernetes/pkg/kubelet/config" ++ "k8s.io/kubernetes/pkg/kubelet/lifecycle" ++ "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache" ++ schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" ++) ++ ++// ManagerStub provides a simple stub implementation for the Device Manager. ++type ManagerStub struct{} ++ ++// NewManagerStub creates a ManagerStub. ++func NewManagerStub() (*ManagerStub, error) { ++ return &ManagerStub{}, nil ++} ++ ++// Start simply returns nil. ++func (h *ManagerStub) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error { ++ return nil ++} ++ ++// Stop simply returns nil. ++func (h *ManagerStub) Stop() error { ++ return nil ++} ++ ++// Allocate simply returns nil. ++func (h *ManagerStub) Allocate(pod *v1.Pod, container *v1.Container) error { ++ return nil ++} ++ ++// UpdatePluginResources simply returns nil. ++func (h *ManagerStub) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error { ++ return nil ++} ++ ++// GetDeviceRunContainerOptions simply returns nil. ++func (h *ManagerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) { ++ return nil, nil ++} ++ ++// GetCapacity simply returns nil capacity and empty removed resource list. 
++func (h *ManagerStub) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) { ++ return nil, nil, []string{} ++} ++ ++// GetWatcherHandler returns plugin watcher interface ++func (h *ManagerStub) GetWatcherHandler() cache.PluginHandler { ++ return nil ++} ++ ++// GetTopologyHints returns an empty TopologyHint map ++func (h *ManagerStub) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint { ++ return map[string][]topologymanager.TopologyHint{} ++} ++ ++// GetPodTopologyHints returns an empty TopologyHint map ++func (h *ManagerStub) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint { ++ return map[string][]topologymanager.TopologyHint{} ++} ++ ++// GetDevices returns nil ++func (h *ManagerStub) GetDevices(_, _ string) ResourceDeviceInstances { ++ return nil ++} ++ ++// GetAllocatableDevices returns nothing ++func (h *ManagerStub) GetAllocatableDevices() ResourceDeviceInstances { ++ return nil ++} ++ ++// ShouldResetExtendedResourceCapacity returns false ++func (h *ManagerStub) ShouldResetExtendedResourceCapacity() bool { ++ return false ++} ++ ++// UpdateAllocatedDevices returns nothing ++func (h *ManagerStub) UpdateAllocatedDevices() { ++ return ++} +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-keep-normal-containers-off-reserv.patch b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-keep-normal-containers-off-reserv.patch new file mode 100644 index 000000000..383e0a232 --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-cpumanager-keep-normal-containers-off-reserv.patch @@ -0,0 +1,356 @@ +From 7b4e8029de25b57c25b510178a41ceddf556d428 Mon Sep 17 00:00:00 2001 +From: Ramesh Kumar Sivanandam +Date: Mon, 7 Nov 2022 08:43:43 -0500 +Subject: [PATCH 06/10] kubelet cpumanager keep normal containers off reserved CPUs + +When starting the kubelet process, two separate sets of reserved CPUs 
+may be specified. With this change CPUs reserved via +'--system-reserved=cpu' +or '--kube-reserved=cpu' will be ignored by kubernetes itself. A small +tweak to the default CPU affinity ensures that "normal" Kubernetes +pods won't run on the reserved CPUs. + +Co-authored-by: Jim Gauld +Signed-off-by: Sachin Gopala Krishna +Signed-off-by: Ramesh Kumar Sivanandam +--- + pkg/kubelet/cm/cpumanager/cpu_manager.go | 6 ++- + pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 19 +++++++--- + pkg/kubelet/cm/cpumanager/policy_static.go | 30 ++++++++++++--- + .../cm/cpumanager/policy_static_test.go | 38 ++++++++++++++----- + 4 files changed, 71 insertions(+), 22 deletions(-) + +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go +index 9e2dce60501..e2c89efeb2e 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go +@@ -192,7 +192,11 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc + // exclusively allocated. + reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000 + numReservedCPUs := int(math.Ceil(reservedCPUsFloat)) +- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions) ++ // NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset. ++ // This variable is primarily to make testing easier. 
++ excludeReserved := true ++ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions, excludeReserved) ++ + if err != nil { + return nil, fmt.Errorf("new static policy error: %w", err) + } +diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +index 29941611a53..e7c74453472 100644 +--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go ++++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go +@@ -215,6 +215,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string }) + } + + func TestCPUManagerAdd(t *testing.T) { ++ testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 4, +@@ -230,7 +231,8 @@ func TestCPUManagerAdd(t *testing.T) { + 0, + cpuset.NewCPUSet(), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + testCases := []struct { + description string + updateErr error +@@ -479,8 +481,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) { + }, + } + ++ testExcl := false + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testExcl) + + mockState := &mockState{ + assignments: testCase.stAssignments, +@@ -705,6 +708,7 @@ func TestCPUManagerRemove(t *testing.T) { + } + + func TestReconcileState(t *testing.T) { ++ testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 8, +@@ -724,7 +728,8 @@ func TestReconcileState(t *testing.T) { + 0, + cpuset.NewCPUSet(), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + + testCases := []struct { + description string +@@ -1228,6 +1233,7 @@ func TestReconcileState(t *testing.T) { + // above test cases are without kubelet --reserved-cpus cmd option + // the following tests 
are with --reserved-cpus configured + func TestCPUManagerAddWithResvList(t *testing.T) { ++ testExcl := false + testPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ + NumCPUs: 4, +@@ -1243,7 +1249,8 @@ func TestCPUManagerAddWithResvList(t *testing.T) { + 1, + cpuset.NewCPUSet(0), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + testCases := []struct { + description string + updateErr error +@@ -1368,6 +1375,7 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) { + } + + func TestCPUManagerGetAllocatableCPUs(t *testing.T) { ++ testExcl := false + nonePolicy, _ := NewNonePolicy(nil) + staticPolicy, _ := NewStaticPolicy( + &topology.CPUTopology{ +@@ -1384,7 +1392,8 @@ func TestCPUManagerGetAllocatableCPUs(t *testing.T) { + 1, + cpuset.NewCPUSet(0), + topologymanager.NewFakeManager(), +- nil) ++ nil, ++ testExcl) + + testCases := []struct { + description string +diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go +index 4d7f7c0b0de..4c4164a9099 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static.go +@@ -99,6 +99,8 @@ type staticPolicy struct { + topology *topology.CPUTopology + // set of CPUs that is not available for exclusive assignment + reserved cpuset.CPUSet ++ // If true, default CPUSet should exclude reserved CPUs ++ excludeReserved bool + // topology manager reference to get container Topology affinity + affinity topologymanager.Store + // set of CPUs to reuse across allocations in a pod +@@ -113,7 +115,7 @@ var _ Policy = &staticPolicy{} + // NewStaticPolicy returns a CPU manager policy that does not change CPU + // assignments for exclusively pinned guaranteed containers after the main + // container process starts. 
+-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string) (Policy, error) { ++func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, excludeReserved bool) (Policy, error) { + opts, err := NewStaticPolicyOptions(cpuPolicyOptions) + if err != nil { + return nil, err +@@ -128,6 +130,7 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv + policy := &staticPolicy{ + topology: topology, + affinity: affinity, ++ excludeReserved: excludeReserved, + cpusToReuse: make(map[string]cpuset.CPUSet), + options: opts, + } +@@ -179,7 +182,15 @@ func (p *staticPolicy) validateState(s state.State) error { + } + // state is empty initialize + allCPUs := p.topology.CPUDetails.CPUs() +- s.SetDefaultCPUSet(allCPUs) ++ if p.excludeReserved { ++ // Exclude reserved CPUs from the default CPUSet to keep containers off them ++ // unless explicitly affined. ++ s.SetDefaultCPUSet(allCPUs.Difference(p.reserved)) ++ } else { ++ s.SetDefaultCPUSet(allCPUs) ++ } ++ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n", ++ allCPUs, p.reserved, s.GetDefaultCPUSet()) + return nil + } + +@@ -187,11 +198,12 @@ func (p *staticPolicy) validateState(s state.State) error { + // 1. 
Check if the reserved cpuset is not part of default cpuset because: + // - kube/system reserved have changed (increased) - may lead to some containers not being able to start + // - user tampered with file +- if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) { +- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", +- p.reserved.String(), tmpDefaultCPUset.String()) ++ if !p.excludeReserved { ++ if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) { ++ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", ++ p.reserved.String(), tmpDefaultCPUset.String()) ++ } + } +- + // 2. Check if state for static policy is consistent + for pod := range tmpAssignments { + for container, cset := range tmpAssignments[pod] { +@@ -218,6 +230,9 @@ func (p *staticPolicy) validateState(s state.State) error { + } + } + totalKnownCPUs = totalKnownCPUs.UnionAll(tmpCPUSets) ++ if p.excludeReserved { ++ totalKnownCPUs = totalKnownCPUs.Union(p.reserved) ++ } + if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) { + return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"", + p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String()) +@@ -331,6 +346,9 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa + cpusInUse := getAssignedCPUsOfSiblings(s, podUID, containerName) + if toRelease, ok := s.GetCPUSet(podUID, containerName); ok { + s.Delete(podUID, containerName) ++ if p.excludeReserved { ++ toRelease = toRelease.Difference(p.reserved) ++ } + // Mutate the shared pool, adding released cpus. 
+ toRelease = toRelease.Difference(cpusInUse) + s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease)) +diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go +index 4c10af065a4..80a0c5a9e70 100644 +--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go ++++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go +@@ -36,6 +36,7 @@ type staticPolicyTest struct { + description string + topo *topology.CPUTopology + numReservedCPUs int ++ excludeReserved bool + podUID string + options map[string]string + containerName string +@@ -68,7 +69,8 @@ func (spt staticPolicyTest) PseudoClone() staticPolicyTest { + } + + func TestStaticPolicyName(t *testing.T) { +- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil) ++ testExcl := false ++ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testExcl) + + policyName := policy.Name() + if policyName != "static" { +@@ -98,6 +100,15 @@ func TestStaticPolicyStart(t *testing.T) { + stDefaultCPUSet: cpuset.NewCPUSet(), + expCSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), + }, ++ { ++ description: "empty cpuset exclude reserved", ++ topo: topoDualSocketHT, ++ numReservedCPUs: 2, ++ excludeReserved: true, ++ stAssignments: state.ContainerCPUAssignments{}, ++ stDefaultCPUSet: cpuset.NewCPUSet(), ++ expCSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 7, 8, 9, 10, 11), ++ }, + { + description: "reserved cores 0 & 6 are not present in available cpuset", + topo: topoDualSocketHT, +@@ -144,7 +155,8 @@ func TestStaticPolicyStart(t *testing.T) { + } + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil) ++ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), 
topologymanager.NewFakeManager(), nil, testCase.excludeReserved) ++ + policy := p.(*staticPolicy) + st := &mockState{ + assignments: testCase.stAssignments, +@@ -565,7 +577,8 @@ func runStaticPolicyTestCase(t *testing.T, testCase staticPolicyTest) { + if testCase.topologyHint != nil { + tm = topologymanager.NewFakeManagerWithHint(testCase.topologyHint) + } +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), tm, testCase.options) ++ testExcl := false ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), tm, testCase.options, testExcl) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -636,7 +649,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -668,6 +681,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) { + } + + func TestStaticPolicyRemove(t *testing.T) { ++ excludeReserved := false + testCases := []staticPolicyTest{ + { + description: "SingleSocketHT, DeAllocOneContainer", +@@ -726,7 +740,7 @@ func TestStaticPolicyRemove(t *testing.T) { + } + + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, excludeReserved) + + st := &mockState{ + assignments: testCase.stAssignments, +@@ -748,6 +762,7 @@ func TestStaticPolicyRemove(t *testing.T) { + } + + func TestTopologyAwareAllocateCPUs(t *testing.T) { ++ excludeReserved := false + 
testCases := []struct { + description string + topo *topology.CPUTopology +@@ -816,7 +831,7 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) { + }, + } + for _, tc := range testCases { +- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil) ++ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager(), nil, excludeReserved) + policy := p.(*staticPolicy) + st := &mockState{ + assignments: tc.stAssignments, +@@ -888,9 +903,11 @@ func TestStaticPolicyStartWithResvList(t *testing.T) { + expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"), + }, + } ++ testExcl := false + for _, testCase := range testCases { + t.Run(testCase.description, func(t *testing.T) { +- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil) ++ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) ++ + if !reflect.DeepEqual(err, testCase.expNewErr) { + t.Errorf("StaticPolicy Start() error (%v). 
expected error: %v but got: %v", + testCase.description, testCase.expNewErr, err) +@@ -930,7 +947,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + numReservedCPUs: 1, + reserved: cpuset.NewCPUSet(0), + stAssignments: state.ContainerCPUAssignments{}, +- stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7), ++ stDefaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"), + expErr: fmt.Errorf("not enough cpus available to satisfy request"), + expCPUAlloc: false, +@@ -942,7 +959,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + numReservedCPUs: 2, + reserved: cpuset.NewCPUSet(0, 1), + stAssignments: state.ContainerCPUAssignments{}, +- stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7), ++ stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7), + pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"), + expErr: nil, + expCPUAlloc: true, +@@ -966,8 +983,9 @@ func TestStaticPolicyAddWithResvList(t *testing.T) { + }, + } + ++ testExcl := true + for _, testCase := range testCases { +- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil) ++ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl) + + st := &mockState{ + assignments: testCase.stAssignments, +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch new file mode 100644 index 000000000..335a41b77 --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch @@ -0,0 +1,50 @@ +From d4aa04d78e4a2692e93c1fa638dd624720a8504a Mon Sep 17 00:00:00 2001 +From: Jim Gauld +Date: Fri, 11 Feb 2022 11:06:35 -0500 +Subject: 
[PATCH 04/10] kubelet: sort isolcpus allocation when SMT enabled + +The existing device manager code returns CPUs as devices in unsorted +order. This numerically sorts isolcpus allocations when SMT/HT is +enabled on the host. This logs SMT pairs, singletons, and algorithm +order details to make the algorithm understandable. + +Signed-off-by: Jim Gauld +--- + pkg/kubelet/cm/devicemanager/manager.go | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go +index 191861d9e4a..4c897f0e032 100644 +--- a/pkg/kubelet/cm/devicemanager/manager.go ++++ b/pkg/kubelet/cm/devicemanager/manager.go +@@ -545,7 +545,16 @@ func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) + return cpu_lst[0] + } + } ++ //Make post-analysis of selection algorithm obvious by numerical sorting ++ //the available isolated cpu_id. ++ cpu_ids := make([]int, 0, int(devices.Len())) + for cpu_id := range devices { ++ cpu_id_, _ := strconv.Atoi(cpu_id) ++ cpu_ids = append(cpu_ids, cpu_id_) ++ } ++ sort.Ints(cpu_ids) ++ for _, _cpu_id := range cpu_ids { ++ cpu_id := strconv.Itoa(_cpu_id) + // If we've already found cpu_id as a sibling, skip it. + if _, ok := _iterated_cpu[cpu_id]; ok { + continue +@@ -587,7 +596,9 @@ func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) + } + } + } +- //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst) ++ //This algorithm will get some attention. Show minimal details. 
++ klog.Infof("order_devices_by_sibling: needed=%d, smtpairs=%v, singletons=%v, order=%v", ++ needed, sibling_lst, single_lst, dev_lst) + return dev_lst, nil + } + func smt_enabled() bool { +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch new file mode 100644 index 000000000..8308a82c3 --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch @@ -0,0 +1,151 @@ +From c054164c9012eb9dbe7a36775c9623fefb914996 Mon Sep 17 00:00:00 2001 +From: Ramesh Kumar Sivanandam +Date: Tue, 25 Oct 2022 11:13:35 -0400 +Subject: [PATCH 03/10] kubernetes: make isolcpus allocation SMT-aware + +Enhance isolcpus support in Kubernetes to allocate isolated SMT +siblings to the same container when SMT/HT is enabled on the host. + +As it stands, the device manager code in Kubernetes is not SMT-aware +(since normally it doesn't deal with CPUs). However, StarlingX +exposes isolated CPUs as devices and if possible we want to allocate +all SMT siblings from a CPU core to the same container in order to +minimize cross-container interference due to resource contention +within the CPU core. + +The solution is basically to take the list of isolated CPUs and +re-order it so that the SMT siblings are next to each other. That +way the existing resource selection code will allocate the siblings +together. As an optimization, if it is known that an odd number +of isolated CPUs are desired, a singleton SMT sibling will be +inserted into the list to avoid breaking up sibling pairs.
+ +Signed-off-by: Tao Wang +Signed-off-by: Ramesh Kumar Sivanandam +--- + pkg/kubelet/cm/devicemanager/manager.go | 84 ++++++++++++++++++++++++- + 1 file changed, 83 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go +index 8cb57aa8190..191861d9e4a 100644 +--- a/pkg/kubelet/cm/devicemanager/manager.go ++++ b/pkg/kubelet/cm/devicemanager/manager.go +@@ -19,10 +19,13 @@ package devicemanager + import ( + "context" + "fmt" ++ "io/ioutil" + "os" + "path/filepath" + "runtime" + "sort" ++ "strconv" ++ "strings" + "sync" + "time" + +@@ -36,6 +39,7 @@ import ( + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" ++ "k8s.io/kubernetes/pkg/kubelet/cm/cpuset" + "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint" + plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" +@@ -526,6 +530,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() { + m.allocatedDevices = m.podDevices.devices() + } + ++//Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed', ++//return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list ++//contain as many hyperthread sibling pairs as possible. ++func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) { ++ var dev_lst []string ++ var single_lst []string ++ sibling_lst := make([]string, 0, int(devices.Len())) ++ _iterated_cpu := make(map[string]string) ++ get_sibling := func(cpu string, cpu_lst []string) string { ++ if cpu_lst[0] == cpu { ++ return cpu_lst[1] ++ } else { ++ return cpu_lst[0] ++ } ++ } ++ for cpu_id := range devices { ++ // If we've already found cpu_id as a sibling, skip it. 
++ if _, ok := _iterated_cpu[cpu_id]; ok { ++ continue ++ } ++ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id) ++ dat, err := ioutil.ReadFile(devPath) ++ if err != nil { ++ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id) ++ } ++ cpustring := strings.TrimSuffix(string(dat), "\n") ++ cpu_pair_set, err := cpuset.Parse(cpustring) ++ if err != nil { ++ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring) ++ } ++ var cpu_pair_lst []string ++ for _, v := range cpu_pair_set.ToSlice() { ++ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v)) ++ } ++ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst) ++ if _, ok := devices[sibling_cpu_id]; ok { ++ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id) ++ _iterated_cpu[sibling_cpu_id] = "" ++ } else { ++ single_lst = append(single_lst, cpu_id) ++ } ++ _iterated_cpu[cpu_id] = "" ++ } ++ if needed%2 == 0 { ++ dev_lst = append(sibling_lst, single_lst...) ++ } else { ++ if len(single_lst) > 1 { ++ _tmp_list := append(sibling_lst, single_lst[1:]...) ++ dev_lst = append(single_lst[0:1], _tmp_list...) ++ } else { ++ if len(single_lst) == 0 { ++ dev_lst = sibling_lst ++ } else { ++ dev_lst = append(single_lst, sibling_lst...) ++ } ++ } ++ } ++ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst) ++ return dev_lst, nil ++} ++func smt_enabled() bool { ++ dat, _ := ioutil.ReadFile("/sys/devices/system/cpu/smt/active") ++ state := strings.TrimSuffix(string(dat), "\n") ++ if state == "0" { ++ return false ++ } ++ return true ++} ++ + // Returns list of device Ids we need to allocate with Allocate rpc call. + // Returns empty list in case we don't need to issue the Allocate rpc call. 
+ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) { +@@ -561,7 +634,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi + // Create a closure to help with device allocation + // Returns 'true' once no more devices need to be allocated. + allocateRemainingFrom := func(devices sets.String) bool { +- for device := range devices.Difference(allocated) { ++ availableDevices := devices.Difference(allocated).List() ++ // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together. ++ if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() { ++ var err error ++ availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed) ++ if err != nil { ++ klog.Errorf("error in order_devices_by_sibling: %v", err) ++ } ++ } ++ for _, device := range availableDevices { + m.allocatedDevices[resource].Insert(device) + allocated.Insert(device) + needed-- +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/series b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/series new file mode 100644 index 000000000..1aa14b3be --- /dev/null +++ b/kubernetes/kubernetes-1.26.1/debian/deb_folder/patches/series @@ -0,0 +1,10 @@ +kubeadm-create-platform-pods-with-zero-CPU-resources.patch +Revert-use-subpath-for-coredns-only-for-default-repo.patch +kubernetes-make-isolcpus-allocation-SMT-aware.patch +kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch +kubelet-cpumanager-disable-CFS-quota-throttling.patch +kubelet-cpumanager-keep-normal-containers-off-reserv.patch +kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch +kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch +enable-support-for-kubernetes-to-ignore-isolcpus.patch +kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch