From 2874abbeaa658bd53127f38d167754b20ac46474 Mon Sep 17 00:00:00 2001 From: Jim Gauld Date: Mon, 7 Mar 2022 15:03:24 -0500 Subject: [PATCH] Port kubelet make isolcpus allocation SMT awareness to 1.22.5 Simple port of two existing patches without modification to kubernetes 1.22.5. This enables the feature to make kubelet isolcpus allocation SMT aware. Story: 2008760 Task: 44190 Signed-off-by: Jim Gauld Change-Id: I6fb17f62d90bd6aa26b88dc7057ed7f667cf81b7 --- ...isolcpus-allocation-when-SMT-enabled.patch | 50 ++++++ ...s-make-isolcpus-allocation-SMT-aware.patch | 151 ++++++++++++++++++ .../kubernetes-1.22.5/centos/kubernetes.spec | 5 +- 3 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 kubernetes/kubernetes-1.22.5/centos/files/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch create mode 100644 kubernetes/kubernetes-1.22.5/centos/files/kubernetes-make-isolcpus-allocation-SMT-aware.patch diff --git a/kubernetes/kubernetes-1.22.5/centos/files/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch b/kubernetes/kubernetes-1.22.5/centos/files/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch new file mode 100644 index 000000000..a58e47d5b --- /dev/null +++ b/kubernetes/kubernetes-1.22.5/centos/files/kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch @@ -0,0 +1,50 @@ +From ba9ab333c8b7dca5252e604837914293dc232732 Mon Sep 17 00:00:00 2001 +From: Jim Gauld +Date: Fri, 11 Feb 2022 11:06:35 -0500 +Subject: [PATCH] kubelet: sort isolcpus allocation when SMT enabled + +The existing device manager code returns CPUs as devices in unsorted +order. This numerically sorts isolcpus allocations when SMT/HT is +enabled on the host. This logs SMT pairs, singletons, and algorithm +order details to make the algorithm understandable. + +Signed-off-by: Jim Gauld +--- + pkg/kubelet/cm/devicemanager/manager.go | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go +index 609da8ed86b..a4b247714f7 100644 +--- a/pkg/kubelet/cm/devicemanager/manager.go ++++ b/pkg/kubelet/cm/devicemanager/manager.go +@@ -686,7 +686,16 @@ func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) + return cpu_lst[0] + } + } ++ //Make post-analysis of selection algorithm obvious by numerical sorting ++ //the available isolated cpu_id. ++ cpu_ids := make([]int, 0, int(devices.Len())) + for cpu_id := range devices { ++ cpu_id_, _ := strconv.Atoi(cpu_id) ++ cpu_ids = append(cpu_ids, cpu_id_) ++ } ++ sort.Ints(cpu_ids) ++ for _, _cpu_id := range cpu_ids { ++ cpu_id := strconv.Itoa(_cpu_id) + // If we've already found cpu_id as a sibling, skip it. + if _, ok := _iterated_cpu[cpu_id]; ok { + continue +@@ -728,7 +737,9 @@ func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) + } + } + } +- //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst) ++ //This algorithm will get some attention. Show minimal details. ++ klog.Infof("order_devices_by_sibling: needed=%d, smtpairs=%v, singletons=%v, order=%v", ++ needed, sibling_lst, single_lst, dev_lst) + return dev_lst, nil + } + func smt_enabled() bool { +-- +2.25.1 + diff --git a/kubernetes/kubernetes-1.22.5/centos/files/kubernetes-make-isolcpus-allocation-SMT-aware.patch b/kubernetes/kubernetes-1.22.5/centos/files/kubernetes-make-isolcpus-allocation-SMT-aware.patch new file mode 100644 index 000000000..dbc28d503 --- /dev/null +++ b/kubernetes/kubernetes-1.22.5/centos/files/kubernetes-make-isolcpus-allocation-SMT-aware.patch @@ -0,0 +1,151 @@ +From 95b7b6e1ddb25511c67a3d4018f62df1e76ee7bc Mon Sep 17 00:00:00 2001 +From: Tao Wang +Date: Tue, 25 Jan 2022 19:25:45 -0500 +Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware + +Enhance isolcpus support in Kubernetes to allocate isolated SMT +siblings to the same container when SMT/HT is enabled on the host. + +As it stands, the device manager code in Kubernetes is not SMT-aware +(since normally it doesn't deal with CPUs). However, StarlingX +exposes isolated CPUs as devices and if possible we want to allocate +all SMT siblings from a CPU core to the same container in order to +minimize cross- container interference due to resource contention +within the CPU core. + +The solution is basically to take the list of isolated CPUs and +re-order it so that the SMT siblings are next to each other. That +way the existing resource selection code will allocate the siblings +together. As an optimization, if it is known that an odd number +of isolated CPUs are desired, a singleton SMT sibling will be +inserted into the list to avoid breaking up sibling pairs. + +Signed-off-by: Tao Wang +--- + pkg/kubelet/cm/devicemanager/manager.go | 84 ++++++++++++++++++++++++- + 1 file changed, 83 insertions(+), 1 deletion(-) + +diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go +index 60de14a9..609da8ed 100644 +--- a/pkg/kubelet/cm/devicemanager/manager.go ++++ b/pkg/kubelet/cm/devicemanager/manager.go +@@ -19,11 +19,14 @@ package devicemanager + import ( + "context" + "fmt" ++ "io/ioutil" + "net" + "os" + "path/filepath" + "runtime" + "sort" ++ "strconv" ++ "strings" + "sync" + "time" + +@@ -41,6 +44,7 @@ import ( + "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" ++ "k8s.io/kubernetes/pkg/kubelet/cm/cpuset" + "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/config" +@@ -667,6 +671,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() { + m.allocatedDevices = m.podDevices.devices() + } + ++//Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed', ++//return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list ++//contain as many hyperthread sibling pairs as possible. ++func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) { ++ var dev_lst []string ++ var single_lst []string ++ sibling_lst := make([]string, 0, int(devices.Len())) ++ _iterated_cpu := make(map[string]string) ++ get_sibling := func(cpu string, cpu_lst []string) string { ++ if cpu_lst[0] == cpu { ++ return cpu_lst[1] ++ } else { ++ return cpu_lst[0] ++ } ++ } ++ for cpu_id := range devices { ++ // If we've already found cpu_id as a sibling, skip it. ++ if _, ok := _iterated_cpu[cpu_id]; ok { ++ continue ++ } ++ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id) ++ dat, err := ioutil.ReadFile(devPath) ++ if err != nil { ++ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id) ++ } ++ cpustring := strings.TrimSuffix(string(dat), "\n") ++ cpu_pair_set, err := cpuset.Parse(cpustring) ++ if err != nil { ++ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring) ++ } ++ var cpu_pair_lst []string ++ for _, v := range cpu_pair_set.ToSlice() { ++ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v)) ++ } ++ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst) ++ if _, ok := devices[sibling_cpu_id]; ok { ++ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id) ++ _iterated_cpu[sibling_cpu_id] = "" ++ } else { ++ single_lst = append(single_lst, cpu_id) ++ } ++ _iterated_cpu[cpu_id] = "" ++ } ++ if needed%2 == 0 { ++ dev_lst = append(sibling_lst, single_lst...) ++ } else { ++ if len(single_lst) > 1 { ++ _tmp_list := append(sibling_lst, single_lst[1:]...) ++ dev_lst = append(single_lst[0:1], _tmp_list...) ++ } else { ++ if len(single_lst) == 0 { ++ dev_lst = sibling_lst ++ } else { ++ dev_lst = append(single_lst, sibling_lst...) ++ } ++ } ++ } ++ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst) ++ return dev_lst, nil ++} ++func smt_enabled() bool { ++ dat, _ := ioutil.ReadFile("/sys/devices/system/cpu/smt/active") ++ state := strings.TrimSuffix(string(dat), "\n") ++ if state == "0" { ++ return false ++ } ++ return true ++} ++ + // Returns list of device Ids we need to allocate with Allocate rpc call. + // Returns empty list in case we don't need to issue the Allocate rpc call. + func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) { +@@ -702,7 +775,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi + // Create a closure to help with device allocation + // Returns 'true' once no more devices need to be allocated. + allocateRemainingFrom := func(devices sets.String) bool { +- for device := range devices.Difference(allocated) { ++ availableDevices := devices.Difference(allocated).List() ++ // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together. ++ if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() { ++ var err error ++ availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed) ++ if err != nil { ++ klog.Errorf("error in order_devices_by_sibling: %v", err) ++ } ++ } ++ for _, device := range availableDevices { + m.allocatedDevices[resource].Insert(device) + allocated.Insert(device) + needed-- +-- +2.22.5 + diff --git a/kubernetes/kubernetes-1.22.5/centos/kubernetes.spec b/kubernetes/kubernetes-1.22.5/centos/kubernetes.spec index 10e44463b..35920b605 100644 --- a/kubernetes/kubernetes-1.22.5/centos/kubernetes.spec +++ b/kubernetes/kubernetes-1.22.5/centos/kubernetes.spec @@ -58,7 +58,8 @@ Patch4: kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch Patch5: kubeadm-create-platform-pods-with-zero-CPU-resources.patch Patch6: enable-support-for-kubernetes-to-ignore-isolcpus.patch Patch7: Revert-use-subpath-for-coredns-only-for-default-repo.patch - +Patch8: kubernetes-make-isolcpus-allocation-SMT-aware.patch +Patch9: kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch # It obsoletes cadvisor but needs its source code (literally integrated) @@ -854,6 +855,8 @@ install in production. %patch5 -p1 %patch6 -p1 %patch7 -p1 +%patch8 -p1 +%patch9 -p1 #src/k8s.io/kubernetes/pkg/util/certificates