diff --git a/kubernetes/kubernetes-1.18.1/centos/files/kubernetes-make-isolcpus-allocation-SMT-aware.patch b/kubernetes/kubernetes-1.18.1/centos/files/kubernetes-make-isolcpus-allocation-SMT-aware.patch
new file mode 100644
index 000000000..03e0a9497
--- /dev/null
+++ b/kubernetes/kubernetes-1.18.1/centos/files/kubernetes-make-isolcpus-allocation-SMT-aware.patch
@@ -0,0 +1,240 @@
+From 6bf9795d5e0dfc705299381dc902b22d03ded063 Mon Sep 17 00:00:00 2001
+From: Tao Wang
+Date: Tue, 25 Jan 2022 19:23:43 -0500
+Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware
+
+Enhance isolcpus support in Kubernetes to allocate isolated SMT
+siblings to the same container when SMT/HT is enabled on the host.
+
+As it stands, the device manager code in Kubernetes is not SMT-aware
+(since normally it doesn't deal with CPUs). However, StarlingX
+exposes isolated CPUs as devices and if possible we want to allocate
+all SMT siblings from a CPU core to the same container in order to
+minimize cross-container interference due to resource contention
+within the CPU core.
+
+The solution is basically to take the list of isolated CPUs and
+re-order it so that the SMT siblings are next to each other. That
+way the existing resource selection code will allocate the siblings
+together. As an optimization, if it is known that an odd number
+of isolated CPUs are desired, a singleton SMT sibling will be
+inserted into the list to avoid breaking up sibling pairs.
+
+Signed-off-by: Tao Wang
+---
+ pkg/kubelet/cm/devicemanager/manager.go | 153 ++++++++++++++++++++++--
+ 1 file changed, 146 insertions(+), 7 deletions(-)
+
+diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
+index 65b91393..76a0ea6e 100644
+--- a/pkg/kubelet/cm/devicemanager/manager.go
++++ b/pkg/kubelet/cm/devicemanager/manager.go
+@@ -19,10 +19,13 @@ package devicemanager
+ import (
+     "context"
+     "fmt"
++    "io/ioutil"
+     "net"
+     "os"
+     "path/filepath"
+     "sort"
++    "strconv"
++    "strings"
+     "sync"
+     "time"
+
+@@ -41,6 +44,7 @@ import (
+     "k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
+     "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
+     cputopology "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
++    "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
+     "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
+     "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
+     "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
+@@ -635,6 +639,80 @@ func (m *ManagerImpl) UpdateAllocatedDevices() {
+     m.allocatedDevices = m.podDevices.devices()
+ }
+
++// Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed',
++// return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list
++// contain as many hyperthread sibling pairs as possible.
++func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) {
++    var dev_lst []string
++    var single_lst []string
++    sibling_lst := make([]string, 0, int(devices.Len()))
++    _iterated_cpu := make(map[string]string)
++
++    get_sibling := func(cpu string, cpu_lst []string) string {
++        if cpu_lst[0] == cpu {
++            return cpu_lst[1]
++        } else {
++            return cpu_lst[0]
++        }
++    }
++    for cpu_id := range devices {
++        // If we've already found cpu_id as a sibling, skip it.
++        if _, ok := _iterated_cpu[cpu_id]; ok {
++            continue
++        }
++        devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id)
++        dat, err := ioutil.ReadFile(devPath)
++        if err != nil {
++            return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id)
++        }
++        cpustring := strings.TrimSuffix(string(dat), "\n")
++        cpu_pair_set, err := cpuset.Parse(cpustring)
++        if err != nil {
++            return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring)
++        }
++        var cpu_pair_lst []string
++        for _, v := range cpu_pair_set.ToSlice() {
++            cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v))
++        }
++        sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst)
++
++        if _, ok := devices[sibling_cpu_id]; ok {
++            sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id)
++            _iterated_cpu[sibling_cpu_id] = ""
++        } else {
++            single_lst = append(single_lst, cpu_id)
++        }
++        _iterated_cpu[cpu_id] = ""
++    }
++
++    if needed%2 == 0 {
++        dev_lst = append(sibling_lst, single_lst...)
++    } else {
++        if len(single_lst) > 1 {
++            _tmp_list := append(sibling_lst, single_lst[1:]...)
++            dev_lst = append(single_lst[0:1], _tmp_list...)
++        } else {
++            if len(single_lst) == 0 {
++                dev_lst = sibling_lst
++            } else {
++                dev_lst = append(single_lst, sibling_lst...)
++            }
++
++        }
++    }
++    //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
++    return dev_lst, nil
++}
++
++func smt_enabled() bool {
++    dat, _ := ioutil.ReadFile("/sys/devices/system/cpu/smt/active")
++    state := strings.TrimSuffix(string(dat), "\n")
++    if state == "0" {
++        return false
++    }
++    return true
++}
++
+ // Returns list of device Ids we need to allocate with Allocate rpc call.
+ // Returns empty list in case we don't need to issue the Allocate rpc call.
+ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
+@@ -664,13 +742,29 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
+     }
+     devices = sets.NewString()
+     // Allocates from reusableDevices list first.
+-    for device := range reusableDevices {
+-        devices.Insert(device)
+-        needed--
+-        if needed == 0 {
+-            return devices, nil
++    if resource == "windriver.com/isolcpus" && smt_enabled() {
++        _reusableDevices, err := order_devices_by_sibling(reusableDevices, needed)
++        if err != nil {
++            klog.Errorf("error in order_devices_by_sibling: %v", err)
++        }
++        // _reusableDevices is a slice, so we need a separate loop to process it here.
++        for _, device := range _reusableDevices {
++            devices.Insert(device)
++            needed--
++            if needed == 0 {
++                return devices, nil
++            }
++        }
++    } else {
++        for device := range reusableDevices {
++            devices.Insert(device)
++            needed--
++            if needed == 0 {
++                return devices, nil
++            }
+         }
+     }
++
+     // Needs to allocate additional devices.
+     if m.allocatedDevices[resource] == nil {
+         m.allocatedDevices[resource] = sets.NewString()
+@@ -682,13 +776,25 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
+     if available.Len() < needed {
+         return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
+     }
+-    // By default, pull devices from the unsorted list of available devices.
+-    allocated := available.UnsortedList()[:needed]
++
+     // If topology alignment is desired, update allocated to the set of devices
+     // with the best alignment.
++    var allocated []string
+     hint := m.topologyAffinityStore.GetAffinity(podUID, contName)
+     if m.deviceHasTopologyAlignment(resource) && hint.NUMANodeAffinity != nil {
+         allocated = m.takeByTopology(resource, available, hint.NUMANodeAffinity, needed)
++    } else {
++        if resource == "windriver.com/isolcpus" && smt_enabled() {
++            var err error
++            allocated, err = order_devices_by_sibling(available, needed)
++            allocated = allocated[:needed]
++            if err != nil {
++                klog.Errorf("error in order_devices_by_sibling: %v", err)
++            }
++        } else {
++            // By default, pull devices from the unsorted list of available devices.
++            allocated = available.UnsortedList()[:needed]
++        }
+     }
+     // Updates m.allocatedDevices with allocated devices to prevent them
+     // from being allocated to other pods/containers, given that we are
+@@ -764,6 +870,39 @@ func (m *ManagerImpl) takeByTopology(resource string, available sets.String, aff
+         }
+     }
+
++    // Add specific logic to process the isolcpus resource.
++    // Try not to disturb the original logical structure.
++    // Sort the original three lists by sibling: fromAffinity, notFromAffinity, withoutTopology.
++    if resource == "windriver.com/isolcpus" && smt_enabled() {
++        var err error
++        _request_device_map := make(sets.String)
++        for _, dev := range fromAffinity {
++            _request_device_map[dev] = sets.Empty{}
++        }
++        fromAffinity, err = order_devices_by_sibling(_request_device_map, request)
++        if err != nil {
++            klog.Errorf("error in order_devices_by_sibling: %v", err)
++        }
++
++        _request_device_map = make(sets.String)
++        for _, dev := range notFromAffinity {
++            _request_device_map[dev] = sets.Empty{}
++        }
++        notFromAffinity, err = order_devices_by_sibling(_request_device_map, request)
++        if err != nil {
++            klog.Errorf("error in order_devices_by_sibling: %v", err)
++        }
++
++        _request_device_map = make(sets.String)
++        for _, dev := range withoutTopology {
++            _request_device_map[dev] = sets.Empty{}
++        }
++        withoutTopology, err = order_devices_by_sibling(_request_device_map, request)
++        if err != nil {
++            klog.Errorf("error in order_devices_by_sibling: %v", err)
++        }
++    }
++
+ // Concatenate the lists above return the first 'request' devices from it..
+     return append(append(fromAffinity, notFromAffinity...), withoutTopology...)[:request]
+ }
+--
+2.22.5
+
diff --git a/kubernetes/kubernetes-1.18.1/centos/kubernetes.spec b/kubernetes/kubernetes-1.18.1/centos/kubernetes.spec
index c293c7214..40929d10b 100644
--- a/kubernetes/kubernetes-1.18.1/centos/kubernetes.spec
+++ b/kubernetes/kubernetes-1.18.1/centos/kubernetes.spec
@@ -62,6 +62,7 @@ Patch7: kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch
 Patch8: Fix-exclusive-CPU-allocations-being-deleted-at-conta.patch
 Patch9: kubeadm-create-platform-pods-with-zero-CPU-resources.patch
 Patch10: add-option-to-disable-isolcpu-awareness.patch
+Patch11: kubernetes-make-isolcpus-allocation-SMT-aware.patch
 
 # It obsoletes cadvisor but needs its source code (literally integrated)
 Obsoletes: cadvisor
@@ -858,6 +859,7 @@ install in production.
 %patch8 -p1
 %patch9 -p1
 %patch10 -p1
+%patch11 -p1
 
 #src/k8s.io/kubernetes/pkg/util/certificates
 # Patch the code to remove eliptic.P224 support
diff --git a/kubernetes/kubernetes-1.21.8/centos/files/kubernetes-make-isolcpus-allocation-SMT-aware.patch b/kubernetes/kubernetes-1.21.8/centos/files/kubernetes-make-isolcpus-allocation-SMT-aware.patch
new file mode 100644
index 000000000..dbc28d503
--- /dev/null
+++ b/kubernetes/kubernetes-1.21.8/centos/files/kubernetes-make-isolcpus-allocation-SMT-aware.patch
@@ -0,0 +1,151 @@
+From 95b7b6e1ddb25511c67a3d4018f62df1e76ee7bc Mon Sep 17 00:00:00 2001
+From: Tao Wang
+Date: Tue, 25 Jan 2022 19:25:45 -0500
+Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware
+
+Enhance isolcpus support in Kubernetes to allocate isolated SMT
+siblings to the same container when SMT/HT is enabled on the host.
+
+As it stands, the device manager code in Kubernetes is not SMT-aware
+(since normally it doesn't deal with CPUs). However, StarlingX
+exposes isolated CPUs as devices and if possible we want to allocate
+all SMT siblings from a CPU core to the same container in order to
+minimize cross-container interference due to resource contention
+within the CPU core.
+
+The solution is basically to take the list of isolated CPUs and
+re-order it so that the SMT siblings are next to each other. That
+way the existing resource selection code will allocate the siblings
+together. As an optimization, if it is known that an odd number
+of isolated CPUs are desired, a singleton SMT sibling will be
+inserted into the list to avoid breaking up sibling pairs.
+
+Signed-off-by: Tao Wang
+---
+ pkg/kubelet/cm/devicemanager/manager.go | 84 ++++++++++++++++++++++++-
+ 1 file changed, 83 insertions(+), 1 deletion(-)
+
+diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
+index 60de14a9..609da8ed 100644
+--- a/pkg/kubelet/cm/devicemanager/manager.go
++++ b/pkg/kubelet/cm/devicemanager/manager.go
+@@ -19,11 +19,14 @@ package devicemanager
+ import (
+     "context"
+     "fmt"
++    "io/ioutil"
+     "net"
+     "os"
+     "path/filepath"
+     "runtime"
+     "sort"
++    "strconv"
++    "strings"
+     "sync"
+     "time"
+
+@@ -41,6 +44,7 @@ import (
+     "k8s.io/kubernetes/pkg/features"
+     "k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
+     "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
++    "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
+     "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
+     "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
+     "k8s.io/kubernetes/pkg/kubelet/config"
+@@ -667,6 +671,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() {
+     m.allocatedDevices = m.podDevices.devices()
+ }
+
++// Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed',
++// return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list
++// contain as many hyperthread sibling pairs as possible.
++func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) {
++    var dev_lst []string
++    var single_lst []string
++    sibling_lst := make([]string, 0, int(devices.Len()))
++    _iterated_cpu := make(map[string]string)
++    get_sibling := func(cpu string, cpu_lst []string) string {
++        if cpu_lst[0] == cpu {
++            return cpu_lst[1]
++        } else {
++            return cpu_lst[0]
++        }
++    }
++    for cpu_id := range devices {
++        // If we've already found cpu_id as a sibling, skip it.
++        if _, ok := _iterated_cpu[cpu_id]; ok {
++            continue
++        }
++        devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id)
++        dat, err := ioutil.ReadFile(devPath)
++        if err != nil {
++            return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id)
++        }
++        cpustring := strings.TrimSuffix(string(dat), "\n")
++        cpu_pair_set, err := cpuset.Parse(cpustring)
++        if err != nil {
++            return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring)
++        }
++        var cpu_pair_lst []string
++        for _, v := range cpu_pair_set.ToSlice() {
++            cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v))
++        }
++        sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst)
++        if _, ok := devices[sibling_cpu_id]; ok {
++            sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id)
++            _iterated_cpu[sibling_cpu_id] = ""
++        } else {
++            single_lst = append(single_lst, cpu_id)
++        }
++        _iterated_cpu[cpu_id] = ""
++    }
++    if needed%2 == 0 {
++        dev_lst = append(sibling_lst, single_lst...)
++    } else {
++        if len(single_lst) > 1 {
++            _tmp_list := append(sibling_lst, single_lst[1:]...)
++            dev_lst = append(single_lst[0:1], _tmp_list...)
++        } else {
++            if len(single_lst) == 0 {
++                dev_lst = sibling_lst
++            } else {
++                dev_lst = append(single_lst, sibling_lst...)
++            }
++        }
++    }
++    //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
++    return dev_lst, nil
++}
++func smt_enabled() bool {
++    dat, _ := ioutil.ReadFile("/sys/devices/system/cpu/smt/active")
++    state := strings.TrimSuffix(string(dat), "\n")
++    if state == "0" {
++        return false
++    }
++    return true
++}
++
+ // Returns list of device Ids we need to allocate with Allocate rpc call.
+ // Returns empty list in case we don't need to issue the Allocate rpc call.
+ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
+@@ -702,7 +775,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
+     // Create a closure to help with device allocation
+     // Returns 'true' once no more devices need to be allocated.
+     allocateRemainingFrom := func(devices sets.String) bool {
+-        for device := range devices.Difference(allocated) {
++        availableDevices := devices.Difference(allocated).List()
++        // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together.
++        if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() {
++            var err error
++            availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed)
++            if err != nil {
++                klog.Errorf("error in order_devices_by_sibling: %v", err)
++            }
++        }
++        for _, device := range availableDevices {
+             m.allocatedDevices[resource].Insert(device)
+             allocated.Insert(device)
+             needed--
+--
+2.22.5
+
diff --git a/kubernetes/kubernetes-1.21.8/centos/kubernetes.spec b/kubernetes/kubernetes-1.21.8/centos/kubernetes.spec
index b106ec723..a6c9f80f3 100644
--- a/kubernetes/kubernetes-1.21.8/centos/kubernetes.spec
+++ b/kubernetes/kubernetes-1.21.8/centos/kubernetes.spec
@@ -61,6 +61,7 @@ Patch5: kubeadm-create-platform-pods-with-zero-CPU-resources.patch
 Patch6: enable-support-for-kubernetes-to-ignore-isolcpus.patch
 Patch7: Revert-use-subpath-for-coredns-only-for-default-repo.patch
 Patch8: Change-log-level-to-Debug.patch
+Patch9: kubernetes-make-isolcpus-allocation-SMT-aware.patch
 
 # It obsoletes cadvisor but needs its source code (literally integrated)
 Obsoletes: cadvisor
@@ -856,6 +857,7 @@ install in production.
 %patch6 -p1
 %patch7 -p1
 %patch8 -p1
+%patch9 -p1
 
 #src/k8s.io/kubernetes/pkg/util/certificates
 # Patch the code to remove eliptic.P224 support
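
Both copies of the patch implement the same ordering rule, which can be hard to follow when read only as a diff. The short Go program below is a standalone sketch of that rule and is not part of either patch: it substitutes a hard-coded, hypothetical sibling map for the /sys/devices/system/cpu/cpu*/topology/thread_siblings_list lookups done by order_devices_by_sibling, and the names orderBySibling, isolated and sibling are illustrative only. The rule: pair up isolated CPUs whose SMT sibling is also isolated, keep each pair adjacent, and when an odd number of CPUs is requested put one unpaired CPU first so no sibling pair gets split.

// Standalone illustration of the SMT-sibling ordering rule used by the patches above.
// It deliberately avoids sysfs: the sibling relationships are hard-coded test data.
package main

import (
	"fmt"
	"sort"
)

// orderBySibling returns the isolated CPUs ordered so that SMT sibling pairs are
// adjacent. When an odd number of CPUs is needed and an unpaired CPU exists, that
// CPU is placed first so taking the first 'needed' entries never splits a pair.
func orderBySibling(isolated map[string]bool, sibling map[string]string, needed int) []string {
	var pairs, singles []string
	seen := map[string]bool{}

	// Sort the CPU IDs so the example output is deterministic.
	ids := make([]string, 0, len(isolated))
	for id := range isolated {
		ids = append(ids, id)
	}
	sort.Strings(ids)

	for _, id := range ids {
		if seen[id] {
			continue // already emitted as some earlier CPU's sibling
		}
		sib := sibling[id]
		if isolated[sib] && !seen[sib] {
			pairs = append(pairs, id, sib) // both hyperthreads of the core are isolated
			seen[sib] = true
		} else {
			singles = append(singles, id) // the sibling is not in the isolated pool
		}
		seen[id] = true
	}

	if needed%2 == 0 || len(singles) == 0 {
		return append(pairs, singles...)
	}
	// Odd request: lead with one singleton so the sibling pairs stay intact.
	return append(append([]string{singles[0]}, pairs...), singles[1:]...)
}

func main() {
	// Hypothetical host where CPU n and CPU n+4 are SMT siblings.
	isolated := map[string]bool{"2": true, "3": true, "5": true, "6": true, "7": true}
	sibling := map[string]string{"2": "6", "6": "2", "3": "7", "7": "3", "5": "1", "1": "5"}

	fmt.Println(orderBySibling(isolated, sibling, 2)) // [2 6 3 7 5]
	fmt.Println(orderBySibling(isolated, sibling, 3)) // [5 2 6 3 7]
}

With an even request the pairs simply come first; with an odd request the lone CPU 5 leads, so a container asking for three isolated CPUs gets 5 plus the intact pair 2/6 rather than breaking up a core.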