kubernetes: make isolcpus allocation SMT-aware

Enhance isolcpus support in Kubernetes to allocate isolated SMT
siblings to the same container when SMT/HT is enabled on the host.

As it stands, the device manager code in Kubernetes is not SMT-aware
(since normally it doesn't deal with CPUs).  However, StarlingX
exposes isolated CPUs as devices and if possible we want to allocate
all SMT siblings from a CPU core to the same container in order to
minimize cross- container interference due to resource contention
within the CPU core.

The solution is basically to take the list of isolated CPUs and
re-order it so that the SMT siblings are next to each other.  That
way the existing resource selection code will allocate the siblings
together.  As an optimization, if it is known that an odd number
of isolated CPUs are desired, a singleton SMT sibling will be
inserted into the list to avoid breaking up sibling pairs.

Test Plan:

Tested with AIO-SX HT enabled and disabled (disabled will not
run the code).  Test results below are with HT enabled.

Platform cpu layout:

       Socket 0        Socket 1
       --------        --------
Core 0 [0, 16]         [8, 24]
Core 1 [1, 17]         [9, 25]
Core 2 [2, 18]         [10, 26]
Core 3 [3, 19]         [11, 27]
Core 4 [4, 20]         [12, 28]
Core 5 [5, 21]         [13, 29]
Core 6 [6, 22]         [14, 30]
Core 7 [7, 23]         [15, 31]

isolcpus=2-3,8-9,18-19,24-25
Ct: container
InCt: initcontainer
U: isolcpus
Test command in pod:cat /sys/fs/cgroup/cpuset/cpuset.cpus

Case 1: 1Ct_3U,got [2-3,19],Passed
Case 2: Keep case 1;create 1Ct_2U,got [9,25],Passed
Case 3: Keep case 1,2;create 1Ct_1U got[18],
        create another 1Ct_2U got [8,24],Passed
Case 4: Reboot after case 3;Pods keep cpu as above.Passed
Case 5: Clean All;create 2Ct_3U by one replicaset,pod_1[8-9,24],
        pod_2[3,18-19],Passed
Case 6: Keep case 5;create 1Ct_2U,got [2,25],[2,25] is the last
        two non-siblings isocpus,as expected,Passed
Case 7: Clean All;create 2InCt_1Ct_2U,InCt_1 got[2,18],
        InCt_2 got[2,18],Pod got [2,18],Passed
Case 8: Clean All;create 2InCt_2Ct_2U, Pod1_InCt_1 got[3,19],
        Pod1_InCt_2 got[3,19], Pod1 got [3,19],Pod2_InCt_1 got[2,18],
        Pod2_InCt_2 got[2,18], Pod2 got [2,18],Cpu in initcontainer
        is tested by write file to pvc,Passed

Story: 2008760
Task: 44190

Change-Id: I8bd03352cc395bada9126fb0fce8ed268ac36456
Signed-off-by: Tao Wang <tao.wang@windriver.com>
Signed-off-by: Chris Friesen <chris.friesen@windriver.com>
This commit is contained in:
Tao Wang 2022-01-25 19:33:16 -05:00 committed by Chris Friesen
parent e0f3a93944
commit 098692ba0b
4 changed files with 395 additions and 0 deletions

View File

@ -0,0 +1,240 @@
From 6bf9795d5e0dfc705299381dc902b22d03ded063 Mon Sep 17 00:00:00 2001
From: Tao Wang <tao.wang@windriver.com>
Date: Tue, 25 Jan 2022 19:23:43 -0500
Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware
Enhance isolcpus support in Kubernetes to allocate isolated SMT
siblings to the same container when SMT/HT is enabled on the host.
As it stands, the device manager code in Kubernetes is not SMT-aware
(since normally it doesn't deal with CPUs). However, StarlingX
exposes isolated CPUs as devices and if possible we want to allocate
all SMT siblings from a CPU core to the same container in order to
minimize cross- container interference due to resource contention
within the CPU core.
The solution is basically to take the list of isolated CPUs and
re-order it so that the SMT siblings are next to each other. That
way the existing resource selection code will allocate the siblings
together. As an optimization, if it is known that an odd number
of isolated CPUs are desired, a singleton SMT sibling will be
inserted into the list to avoid breaking up sibling pairs.
Signed-off-by: Tao Wang <tao.wang@windriver.com>
---
pkg/kubelet/cm/devicemanager/manager.go | 153 ++++++++++++++++++++++--
1 file changed, 146 insertions(+), 7 deletions(-)
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index 65b91393..76a0ea6e 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -19,10 +19,13 @@ package devicemanager
import (
"context"
"fmt"
+ "io/ioutil"
"net"
"os"
"path/filepath"
"sort"
+ "strconv"
+ "strings"
"sync"
"time"
@@ -41,6 +44,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
cputopology "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
+ "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
@@ -635,6 +639,80 @@ func (m *ManagerImpl) UpdateAllocatedDevices() {
m.allocatedDevices = m.podDevices.devices()
}
+//Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed',
+//return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list
+//contain as many hyperthread sibling pairs as possible.
+func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) {
+ var dev_lst []string
+ var single_lst []string
+ sibling_lst := make([]string, 0, int(devices.Len()))
+ _iterated_cpu := make(map[string]string)
+
+ get_sibling := func(cpu string, cpu_lst []string) string {
+ if cpu_lst[0] == cpu {
+ return cpu_lst[1]
+ } else {
+ return cpu_lst[0]
+ }
+ }
+ for cpu_id := range devices {
+ // If we've already found cpu_id as a sibling, skip it.
+ if _, ok := _iterated_cpu[cpu_id]; ok {
+ continue
+ }
+ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id)
+ dat, err := ioutil.ReadFile(devPath)
+ if err != nil {
+ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id)
+ }
+ cpustring := strings.TrimSuffix(string(dat), "\n")
+ cpu_pair_set, err := cpuset.Parse(cpustring)
+ if err != nil {
+ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring)
+ }
+ var cpu_pair_lst []string
+ for _, v := range cpu_pair_set.ToSlice() {
+ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v))
+ }
+ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst)
+
+ if _, ok := devices[sibling_cpu_id]; ok {
+ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id)
+ _iterated_cpu[sibling_cpu_id] = ""
+ } else {
+ single_lst = append(single_lst, cpu_id)
+ }
+ _iterated_cpu[cpu_id] = ""
+ }
+
+ if needed%2 == 0 {
+ dev_lst = append(sibling_lst, single_lst...)
+ } else {
+ if len(single_lst) > 1 {
+ _tmp_list := append(sibling_lst, single_lst[1:]...)
+ dev_lst = append(single_lst[0:1], _tmp_list...)
+ } else {
+ if len(single_lst) == 0 {
+ dev_lst = sibling_lst
+ } else {
+ dev_lst = append(single_lst, sibling_lst...)
+ }
+
+ }
+ }
+ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
+ return dev_lst, nil
+}
+
+func smt_enabled() bool {
+ dat, _ := ioutil.ReadFile("/sys/devices/system/cpu/smt/active")
+ state := strings.TrimSuffix(string(dat), "\n")
+ if state == "0" {
+ return false
+ }
+ return true
+}
+
// Returns list of device Ids we need to allocate with Allocate rpc call.
// Returns empty list in case we don't need to issue the Allocate rpc call.
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
@@ -664,13 +742,29 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
}
devices = sets.NewString()
// Allocates from reusableDevices list first.
- for device := range reusableDevices {
- devices.Insert(device)
- needed--
- if needed == 0 {
- return devices, nil
+ if resource == "windriver.com/isolcpus" && smt_enabled() {
+ _reusableDevices, err := order_devices_by_sibling(reusableDevices, needed)
+ if err != nil {
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
+ }
+ // _reusableDevices is type of slice,So we need a unique loop to process here.
+ for _, device := range _reusableDevices {
+ devices.Insert(device)
+ needed--
+ if needed == 0 {
+ return devices, nil
+ }
+ }
+ } else {
+ for device := range reusableDevices {
+ devices.Insert(device)
+ needed--
+ if needed == 0 {
+ return devices, nil
+ }
}
}
+
// Needs to allocate additional devices.
if m.allocatedDevices[resource] == nil {
m.allocatedDevices[resource] = sets.NewString()
@@ -682,13 +776,25 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
if available.Len() < needed {
return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
}
- // By default, pull devices from the unsorted list of available devices.
- allocated := available.UnsortedList()[:needed]
+
// If topology alignment is desired, update allocated to the set of devices
// with the best alignment.
+ var allocated []string
hint := m.topologyAffinityStore.GetAffinity(podUID, contName)
if m.deviceHasTopologyAlignment(resource) && hint.NUMANodeAffinity != nil {
allocated = m.takeByTopology(resource, available, hint.NUMANodeAffinity, needed)
+ } else {
+ if resource == "windriver.com/isolcpus" && smt_enabled() {
+ var err error
+ allocated, err = order_devices_by_sibling(available, needed)
+ allocated = allocated[:needed]
+ if err != nil {
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
+ }
+ } else {
+ // By default, pull devices from the unsorted list of available devices.
+ allocated = available.UnsortedList()[:needed]
+ }
}
// Updates m.allocatedDevices with allocated devices to prevent them
// from being allocated to other pods/containers, given that we are
@@ -764,6 +870,39 @@ func (m *ManagerImpl) takeByTopology(resource string, available sets.String, aff
}
}
+ //Add specific logic to process isolcpus resource.
+ //Try to not sabotage the original logical structure.
+ //Sort the original three lists by sibling: fromAffinity,notFromAffinity,withoutTopology
+ if resource == "windriver.com/isolcpus" && smt_enabled() {
+ var err error
+ _request_device_map := make(sets.String)
+ for _, dev := range fromAffinity {
+ _request_device_map[dev] = sets.Empty{}
+ }
+ fromAffinity, err = order_devices_by_sibling(_request_device_map, request)
+ if err != nil {
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
+ }
+
+ _request_device_map = make(sets.String)
+ for _, dev := range notFromAffinity {
+ _request_device_map[dev] = sets.Empty{}
+ }
+ notFromAffinity, err = order_devices_by_sibling(_request_device_map, request)
+ if err != nil {
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
+ }
+
+ _request_device_map = make(sets.String)
+ for _, dev := range withoutTopology {
+ _request_device_map[dev] = sets.Empty{}
+ }
+ withoutTopology, err = order_devices_by_sibling(_request_device_map, request)
+ if err != nil {
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
+ }
+ }
+
// Concatenate the lists above return the first 'request' devices from it..
return append(append(fromAffinity, notFromAffinity...), withoutTopology...)[:request]
}
--
2.22.5

View File

@ -62,6 +62,7 @@ Patch7: kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch
Patch8: Fix-exclusive-CPU-allocations-being-deleted-at-conta.patch
Patch9: kubeadm-create-platform-pods-with-zero-CPU-resources.patch
Patch10: add-option-to-disable-isolcpu-awareness.patch
Patch11: kubernetes-make-isolcpus-allocation-SMT-aware.patch
# It obsoletes cadvisor but needs its source code (literally integrated)
Obsoletes: cadvisor
@ -858,6 +859,7 @@ install in production.
%patch8 -p1
%patch9 -p1
%patch10 -p1
%patch11 -p1
#src/k8s.io/kubernetes/pkg/util/certificates
# Patch the code to remove eliptic.P224 support

View File

@ -0,0 +1,151 @@
From 95b7b6e1ddb25511c67a3d4018f62df1e76ee7bc Mon Sep 17 00:00:00 2001
From: Tao Wang <tao.wang@windriver.com>
Date: Tue, 25 Jan 2022 19:25:45 -0500
Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware
Enhance isolcpus support in Kubernetes to allocate isolated SMT
siblings to the same container when SMT/HT is enabled on the host.
As it stands, the device manager code in Kubernetes is not SMT-aware
(since normally it doesn't deal with CPUs). However, StarlingX
exposes isolated CPUs as devices and if possible we want to allocate
all SMT siblings from a CPU core to the same container in order to
minimize cross- container interference due to resource contention
within the CPU core.
The solution is basically to take the list of isolated CPUs and
re-order it so that the SMT siblings are next to each other. That
way the existing resource selection code will allocate the siblings
together. As an optimization, if it is known that an odd number
of isolated CPUs are desired, a singleton SMT sibling will be
inserted into the list to avoid breaking up sibling pairs.
Signed-off-by: Tao Wang <tao.wang@windriver.com>
---
pkg/kubelet/cm/devicemanager/manager.go | 84 ++++++++++++++++++++++++-
1 file changed, 83 insertions(+), 1 deletion(-)
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index 60de14a9..609da8ed 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -19,11 +19,14 @@ package devicemanager
import (
"context"
"fmt"
+ "io/ioutil"
"net"
"os"
"path/filepath"
"runtime"
"sort"
+ "strconv"
+ "strings"
"sync"
"time"
@@ -41,6 +44,7 @@ import (
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
+ "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
@@ -667,6 +671,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() {
m.allocatedDevices = m.podDevices.devices()
}
+//Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed',
+//return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list
+//contain as many hyperthread sibling pairs as possible.
+func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) {
+ var dev_lst []string
+ var single_lst []string
+ sibling_lst := make([]string, 0, int(devices.Len()))
+ _iterated_cpu := make(map[string]string)
+ get_sibling := func(cpu string, cpu_lst []string) string {
+ if cpu_lst[0] == cpu {
+ return cpu_lst[1]
+ } else {
+ return cpu_lst[0]
+ }
+ }
+ for cpu_id := range devices {
+ // If we've already found cpu_id as a sibling, skip it.
+ if _, ok := _iterated_cpu[cpu_id]; ok {
+ continue
+ }
+ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id)
+ dat, err := ioutil.ReadFile(devPath)
+ if err != nil {
+ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id)
+ }
+ cpustring := strings.TrimSuffix(string(dat), "\n")
+ cpu_pair_set, err := cpuset.Parse(cpustring)
+ if err != nil {
+ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring)
+ }
+ var cpu_pair_lst []string
+ for _, v := range cpu_pair_set.ToSlice() {
+ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v))
+ }
+ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst)
+ if _, ok := devices[sibling_cpu_id]; ok {
+ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id)
+ _iterated_cpu[sibling_cpu_id] = ""
+ } else {
+ single_lst = append(single_lst, cpu_id)
+ }
+ _iterated_cpu[cpu_id] = ""
+ }
+ if needed%2 == 0 {
+ dev_lst = append(sibling_lst, single_lst...)
+ } else {
+ if len(single_lst) > 1 {
+ _tmp_list := append(sibling_lst, single_lst[1:]...)
+ dev_lst = append(single_lst[0:1], _tmp_list...)
+ } else {
+ if len(single_lst) == 0 {
+ dev_lst = sibling_lst
+ } else {
+ dev_lst = append(single_lst, sibling_lst...)
+ }
+ }
+ }
+ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
+ return dev_lst, nil
+}
+func smt_enabled() bool {
+ dat, _ := ioutil.ReadFile("/sys/devices/system/cpu/smt/active")
+ state := strings.TrimSuffix(string(dat), "\n")
+ if state == "0" {
+ return false
+ }
+ return true
+}
+
// Returns list of device Ids we need to allocate with Allocate rpc call.
// Returns empty list in case we don't need to issue the Allocate rpc call.
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
@@ -702,7 +775,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
// Create a closure to help with device allocation
// Returns 'true' once no more devices need to be allocated.
allocateRemainingFrom := func(devices sets.String) bool {
- for device := range devices.Difference(allocated) {
+ availableDevices := devices.Difference(allocated).List()
+ // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together.
+ if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() {
+ var err error
+ availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed)
+ if err != nil {
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
+ }
+ }
+ for _, device := range availableDevices {
m.allocatedDevices[resource].Insert(device)
allocated.Insert(device)
needed--
--
2.22.5

View File

@ -61,6 +61,7 @@ Patch5: kubeadm-create-platform-pods-with-zero-CPU-resources.patch
Patch6: enable-support-for-kubernetes-to-ignore-isolcpus.patch
Patch7: Revert-use-subpath-for-coredns-only-for-default-repo.patch
Patch8: Change-log-level-to-Debug.patch
Patch9: kubernetes-make-isolcpus-allocation-SMT-aware.patch
# It obsoletes cadvisor but needs its source code (literally integrated)
Obsoletes: cadvisor
@ -856,6 +857,7 @@ install in production.
%patch6 -p1
%patch7 -p1
%patch8 -p1
%patch9 -p1
#src/k8s.io/kubernetes/pkg/util/certificates
# Patch the code to remove eliptic.P224 support