bc28897dc5
This change ports the following kubernetes 1.28.4 patches, some of which were refactored slightly to allow for upstream changes. The following patches were applied cleanly: kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch, kubelet-cpumanager-disable-CFS-quota-throttling.patch, kubelet-cpumanager-keep-normal-containers-off-reserv.patch, kubelet-cpumanager-infra-pods-use-system-reserved-CP.patch, Affinity-of-guaranteed-pod-to-non-isolated-CPUs.patch, kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch. The following patches were refactored: kubeadm-create-platform-pods-with-zero-CPU-resources.patch, kubernetes-make-isolcpus-allocation-SMT-aware.patch, kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch, enable-support-for-kubernetes-to-ignore-isolcpus.patch. Note: Revert-use-subpath-for-coredns-only-for-default-repo.patch is removed, as the change that updates the dns imageRepository is now taken care of in the ansible playbook: https://review.opendev.org/c/starlingx/ansible-playbooks/+/903499/1/playbookconfig/src/playbooks/roles/common/files/kubeadm.yaml.j2 Test Plan: PASS: Kubernetes package 1.28.4 builds properly. PASS: Run all Kubelet, kubeadm, kubectl make tests for affected code. PASS: build-iso successful with multiple kubernetes versions. PASS: Install iso with k8s 1.28 default and test all patches. Story: 2010878 Task: 49209 Change-Id: I7693ad2fcc93d146eeae882d44f83b60589565db Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
149 lines
5.8 KiB
Diff
149 lines
5.8 KiB
Diff
From 2ae8d69bc49fcbf6fe95209fef0e256b10ad7a0f Mon Sep 17 00:00:00 2001
|
|
From: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
|
|
Date: Fri, 1 Dec 2023 05:27:16 -0500
|
|
Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware
|
|
|
|
Enhance isolcpus support in Kubernetes to allocate isolated SMT
|
|
siblings to the same container when SMT/HT is enabled on the host.
|
|
|
|
As it stands, the device manager code in Kubernetes is not SMT-aware
|
|
(since normally it doesn't deal with CPUs). However, StarlingX
|
|
exposes isolated CPUs as devices and if possible we want to allocate
|
|
all SMT siblings from a CPU core to the same container in order to
|
|
minimize cross-container interference due to resource contention
|
|
within the CPU core.
|
|
|
|
The solution is basically to take the list of isolated CPUs and
|
|
re-order it so that the SMT siblings are next to each other. That
|
|
way the existing resource selection code will allocate the siblings
|
|
together. As an optimization, if it is known that an odd number
|
|
of isolated CPUs are desired, a singleton SMT sibling will be
|
|
inserted at the front of the list to avoid breaking up sibling pairs.
|
|
|
|
Signed-off-by: Tao Wang <tao.wang@windriver.com>
|
|
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
|
|
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
|
|
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
|
|
---
|
|
pkg/kubelet/cm/devicemanager/manager.go | 83 ++++++++++++++++++++++++-
|
|
1 file changed, 82 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
|
|
index d780ee801bd..a9966290ab5 100644
|
|
--- a/pkg/kubelet/cm/devicemanager/manager.go
|
|
+++ b/pkg/kubelet/cm/devicemanager/manager.go
|
|
@@ -23,6 +23,8 @@ import (
|
|
"path/filepath"
|
|
"runtime"
|
|
"sort"
|
|
+ "strconv"
|
|
+ "strings"
|
|
"sync"
|
|
"time"
|
|
|
|
@@ -36,6 +38,7 @@ import (
|
|
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
|
|
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
|
|
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
|
|
+ "k8s.io/utils/cpuset"
|
|
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
|
|
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
|
|
plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1"
|
|
@@ -542,6 +545,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() {
|
|
m.allocatedDevices = m.podDevices.devices()
|
|
}
|
|
|
|
+//Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed',
|
|
+//return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list
|
|
+//contain as many hyperthread sibling pairs as possible.
|
|
+func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) {
|
|
+ var dev_lst []string
|
|
+ var single_lst []string
|
|
+ sibling_lst := make([]string, 0, int(devices.Len()))
|
|
+ _iterated_cpu := make(map[string]string)
|
|
+ get_sibling := func(cpu string, cpu_lst []string) string {
|
|
+ if cpu_lst[0] == cpu {
|
|
+ return cpu_lst[1]
|
|
+ } else {
|
|
+ return cpu_lst[0]
|
|
+ }
|
|
+ }
|
|
+ for cpu_id := range devices {
|
|
+ // If we've already found cpu_id as a sibling, skip it.
|
|
+ if _, ok := _iterated_cpu[cpu_id]; ok {
|
|
+ continue
|
|
+ }
|
|
+ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id)
|
|
+ dat, err := os.ReadFile(devPath)
|
|
+ if err != nil {
|
|
+ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id)
|
|
+ }
|
|
+ cpustring := strings.TrimSuffix(string(dat), "\n")
|
|
+ cpu_pair_set, err := cpuset.Parse(cpustring)
|
|
+ if err != nil {
|
|
+ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring)
|
|
+ }
|
|
+ var cpu_pair_lst []string
|
|
+ for _, v := range cpu_pair_set.List() {
|
|
+ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v))
|
|
+ }
|
|
+ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst)
|
|
+ if _, ok := devices[sibling_cpu_id]; ok {
|
|
+ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id)
|
|
+ _iterated_cpu[sibling_cpu_id] = ""
|
|
+ } else {
|
|
+ single_lst = append(single_lst, cpu_id)
|
|
+ }
|
|
+ _iterated_cpu[cpu_id] = ""
|
|
+ }
|
|
+ if needed%2 == 0 {
|
|
+ dev_lst = append(sibling_lst, single_lst...)
|
|
+ } else {
|
|
+ if len(single_lst) > 1 {
|
|
+ _tmp_list := append(sibling_lst, single_lst[1:]...)
|
|
+ dev_lst = append(single_lst[0:1], _tmp_list...)
|
|
+ } else {
|
|
+ if len(single_lst) == 0 {
|
|
+ dev_lst = sibling_lst
|
|
+ } else {
|
|
+ dev_lst = append(single_lst, sibling_lst...)
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
|
|
+ return dev_lst, nil
|
|
+}
|
|
+func smt_enabled() bool {
|
|
+ dat, _ := os.ReadFile("/sys/devices/system/cpu/smt/active")
|
|
+ state := strings.TrimSuffix(string(dat), "\n")
|
|
+ if state == "0" {
|
|
+ return false
|
|
+ }
|
|
+ return true
|
|
+}
|
|
+
|
|
// Returns list of device Ids we need to allocate with Allocate rpc call.
|
|
// Returns empty list in case we don't need to issue the Allocate rpc call.
|
|
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
|
|
@@ -615,7 +687,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
|
|
// Create a closure to help with device allocation
|
|
// Returns 'true' once no more devices need to be allocated.
|
|
allocateRemainingFrom := func(devices sets.String) bool {
|
|
- for device := range devices.Difference(allocated) {
|
|
+ availableDevices := devices.Difference(allocated).List()
|
|
+ // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together.
|
|
+ if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() {
|
|
+ var err error
|
|
+ availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed)
|
|
+ if err != nil {
|
|
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
|
|
+ }
|
|
+ }
|
|
+ for _, device := range availableDevices {
|
|
m.allocatedDevices[resource].Insert(device)
|
|
allocated.Insert(device)
|
|
needed--
|
|
--
|
|
2.25.1
|
|
|