integ/kubernetes/kubernetes-1.28.4/debian/deb_folder/patches/kubernetes-make-isolcpus-allocation-SMT-aware.patch
Saba Touheed Mujawar bc28897dc5 Add kubernetes 1.28.4 patches
This change ports the following kubernetes 1.28.4 patches which were
refactored slightly to allow for upstream changes

The following patches were applied cleanly:
kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch
kubelet-cpumanager-disable-CFS-quota-throttling.patch
kubelet-cpumanager-keep-normal-containers-off-reserv.patch
kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch
Affinity-of-guaranteed-pod-to-non-isolated-CPUs.patch
kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch

The following patches were refactored:
kubeadm-create-platform-pods-with-zero-CPU-resources.patch
kubernetes-make-isolcpus-allocation-SMT-aware.patch
kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch
enable-support-for-kubernetes-to-ignore-isolcpus.patch

Note: Revert-use-subpath-for-coredns-only-for-default-repo.patch
      is removed as this change that updates the dns
      imageRepository is taken care in ansible playbook
      https://review.opendev.org/c/starlingx/ansible-playbooks/+/903499/1/playbookconfig/src/playbooks/roles/common/files/kubeadm.yaml.j2

Test Plan:
PASS: Kubernetes package 1.28.4 builds properly.
PASS: Run all Kubelet, kubeadm, kubectl make tests for affected code.
PASS: build-iso successful with multiple kubernetes versions
PASS: Install iso with k8s 1.28 default and test all patches.

Story: 2010878
Task: 49209

Change-Id: I7693ad2fcc93d146eeae882d44f83b60589565db
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
2023-12-18 01:38:21 -05:00

149 lines
5.8 KiB
Diff

From 2ae8d69bc49fcbf6fe95209fef0e256b10ad7a0f Mon Sep 17 00:00:00 2001
From: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
Date: Fri, 1 Dec 2023 05:27:16 -0500
Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware
Enhance isolcpus support in Kubernetes to allocate isolated SMT
siblings to the same container when SMT/HT is enabled on the host.
As it stands, the device manager code in Kubernetes is not SMT-aware
(since normally it doesn't deal with CPUs). However, StarlingX
exposes isolated CPUs as devices and if possible we want to allocate
all SMT siblings from a CPU core to the same container in order to
minimize cross- container interference due to resource contention
within the CPU core.
The solution is basically to take the list of isolated CPUs and
re-order it so that the SMT siblings are next to each other. That
way the existing resource selection code will allocate the siblings
together. As an optimization, if it is known that an odd number
of isolated CPUs are desired, a singleton SMT sibling will be
inserted into the list to avoid breaking up sibling pairs.
Signed-off-by: Tao Wang <tao.wang@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
---
pkg/kubelet/cm/devicemanager/manager.go | 83 ++++++++++++++++++++++++-
1 file changed, 82 insertions(+), 1 deletion(-)
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index d780ee801bd..a9966290ab5 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -23,6 +23,8 @@ import (
"path/filepath"
"runtime"
"sort"
+ "strconv"
+ "strings"
"sync"
"time"
@@ -36,6 +38,7 @@ import (
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
+ "k8s.io/utils/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1"
@@ -542,6 +545,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() {
m.allocatedDevices = m.podDevices.devices()
}
+//Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed',
+//return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list
+//contain as many hyperthread sibling pairs as possible.
+func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) {
+ var dev_lst []string
+ var single_lst []string
+ sibling_lst := make([]string, 0, int(devices.Len()))
+ _iterated_cpu := make(map[string]string)
+ get_sibling := func(cpu string, cpu_lst []string) string {
+ if cpu_lst[0] == cpu {
+ return cpu_lst[1]
+ } else {
+ return cpu_lst[0]
+ }
+ }
+ for cpu_id := range devices {
+ // If we've already found cpu_id as a sibling, skip it.
+ if _, ok := _iterated_cpu[cpu_id]; ok {
+ continue
+ }
+ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id)
+ dat, err := os.ReadFile(devPath)
+ if err != nil {
+ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id)
+ }
+ cpustring := strings.TrimSuffix(string(dat), "\n")
+ cpu_pair_set, err := cpuset.Parse(cpustring)
+ if err != nil {
+ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring)
+ }
+ var cpu_pair_lst []string
+ for _, v := range cpu_pair_set.List() {
+ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v))
+ }
+ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst)
+ if _, ok := devices[sibling_cpu_id]; ok {
+ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id)
+ _iterated_cpu[sibling_cpu_id] = ""
+ } else {
+ single_lst = append(single_lst, cpu_id)
+ }
+ _iterated_cpu[cpu_id] = ""
+ }
+ if needed%2 == 0 {
+ dev_lst = append(sibling_lst, single_lst...)
+ } else {
+ if len(single_lst) > 1 {
+ _tmp_list := append(sibling_lst, single_lst[1:]...)
+ dev_lst = append(single_lst[0:1], _tmp_list...)
+ } else {
+ if len(single_lst) == 0 {
+ dev_lst = sibling_lst
+ } else {
+ dev_lst = append(single_lst, sibling_lst...)
+ }
+ }
+ }
+ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
+ return dev_lst, nil
+}
+func smt_enabled() bool {
+ dat, _ := os.ReadFile("/sys/devices/system/cpu/smt/active")
+ state := strings.TrimSuffix(string(dat), "\n")
+ if state == "0" {
+ return false
+ }
+ return true
+}
+
// Returns list of device Ids we need to allocate with Allocate rpc call.
// Returns empty list in case we don't need to issue the Allocate rpc call.
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
@@ -615,7 +687,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
// Create a closure to help with device allocation
// Returns 'true' once no more devices need to be allocated.
allocateRemainingFrom := func(devices sets.String) bool {
- for device := range devices.Difference(allocated) {
+ availableDevices := devices.Difference(allocated).List()
+ // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together.
+ if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() {
+ var err error
+ availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed)
+ if err != nil {
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
+ }
+ }
+ for _, device := range availableDevices {
m.allocatedDevices[resource].Insert(device)
allocated.Insert(device)
needed--
--
2.25.1