Merge "AIO-DX swact task affinity robustness"
This commit is contained in:
commit
931887828b
13
utilities/worker-utils/worker-utils/affine-tasks.sh
Normal file → Executable file
13
utilities/worker-utils/worker-utils/affine-tasks.sh
Normal file → Executable file
@ -68,6 +68,8 @@ LNAME=$(readlink -n -f $0)
|
|||||||
NAME=$(basename $LNAME)
|
NAME=$(basename $LNAME)
|
||||||
PIDFILE=/var/run/${NAME}.pid
|
PIDFILE=/var/run/${NAME}.pid
|
||||||
|
|
||||||
|
TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete"
|
||||||
|
|
||||||
# Define number of logical cpus
|
# Define number of logical cpus
|
||||||
LOGICAL_CPUS=$(getconf _NPROCESSORS_ONLN)
|
LOGICAL_CPUS=$(getconf _NPROCESSORS_ONLN)
|
||||||
|
|
||||||
@ -89,6 +91,11 @@ else
|
|||||||
NONISOL_CPUS=${ONLINE_CPUS}
|
NONISOL_CPUS=${ONLINE_CPUS}
|
||||||
NONISOL_MASK=${ONLINE_MASK}
|
NONISOL_MASK=${ONLINE_MASK}
|
||||||
fi
|
fi
|
||||||
|
# NONISOL_CPULIST is a space separated list, consumed by SM so that
|
||||||
|
# it knows about extra available cores
|
||||||
|
NONISOL_CPULIST=$(echo ${NONISOL_CPUS} | \
|
||||||
|
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
|
||||||
|
sed 's/,/ /g')
|
||||||
|
|
||||||
# Define platform memory nodeset and cpuset
|
# Define platform memory nodeset and cpuset
|
||||||
PLATFORM_NODES=$(cat /sys/devices/system/node/online)
|
PLATFORM_NODES=$(cat /sys/devices/system/node/online)
|
||||||
@ -404,7 +411,8 @@ function affine_drbd_tasks {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Return list of reaffineable pids. This includes all processes, but excludes
|
# Return list of reaffineable pids. This includes all processes, but excludes
|
||||||
# kernel threads, vSwitch, and anything in K8S, docker or qemu/kvm cpuset.
|
# kernel threads, vSwitch, and anything in the cgroup cpusets: k8s-infra, docker,
|
||||||
|
# and machine.slice (i.e., qemu-kvm).
|
||||||
function reaffineable_pids {
|
function reaffineable_pids {
|
||||||
local pids_excl
|
local pids_excl
|
||||||
local pidlist
|
local pidlist
|
||||||
@ -433,6 +441,8 @@ function affine_tasks_to_all_cores {
|
|||||||
${NONISOL_CPUS} ${pid} > /dev/null 2>&1
|
${NONISOL_CPUS} ${pid} > /dev/null 2>&1
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
||||||
|
echo ${NONISOL_CPULIST} > ${TASK_AFFINING_INCOMPLETE}
|
||||||
LOG "Affined ${count} processes to all cores."
|
LOG "Affined ${count} processes to all cores."
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -472,6 +482,7 @@ function affine_tasks_to_platform_cores {
|
|||||||
taskset --pid --cpu-list 0 ${pid} > /dev/null 2>&1
|
taskset --pid --cpu-list 0 ${pid} > /dev/null 2>&1
|
||||||
done
|
done
|
||||||
|
|
||||||
|
rm -v -f ${TASK_AFFINING_INCOMPLETE}
|
||||||
LOG "Affined ${count} processes to platform cores."
|
LOG "Affined ${count} processes to platform cores."
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,19 +19,16 @@
|
|||||||
PATH=/bin:/usr/bin:/usr/local/bin
|
PATH=/bin:/usr/bin:/usr/local/bin
|
||||||
|
|
||||||
. /etc/platform/platform.conf
|
. /etc/platform/platform.conf
|
||||||
LOG_FUNCTIONS=${LOG_FUNCTIONS:-"/etc/init.d/log_functions.sh"}
|
|
||||||
CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"}
|
CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"}
|
||||||
[[ -e ${LOG_FUNCTIONS} ]] && source ${LOG_FUNCTIONS}
|
|
||||||
[[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS}
|
[[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS}
|
||||||
|
|
||||||
# Enable debug logs and tag them
|
|
||||||
LOG_DEBUG=1
|
|
||||||
TAG="TASKAFFINITY:"
|
|
||||||
|
|
||||||
TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete"
|
TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete"
|
||||||
N_CPUS=$(getconf _NPROCESSORS_ONLN)
|
|
||||||
FULLSET_CPUS="0-"$((N_CPUS-1))
|
# The following CPULISTs are space separated lists of logical cpus,
|
||||||
FULLSET_MASK=$(cpulist_to_cpumap ${FULLSET_CPUS} ${N_CPUS})
|
# and are used by helper functions.
|
||||||
|
ISOL_CPULIST=$(/bin/cat /sys/devices/system/cpu/isolated | \
|
||||||
|
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
|
||||||
|
sed 's/,/ /g')
|
||||||
PLATFORM_CPUS=$(platform_expanded_cpu_list)
|
PLATFORM_CPUS=$(platform_expanded_cpu_list)
|
||||||
PLATFORM_CPULIST=$(platform_expanded_cpu_list| \
|
PLATFORM_CPULIST=$(platform_expanded_cpu_list| \
|
||||||
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
|
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
|
||||||
@ -43,8 +40,26 @@ if [[ $vswitch_type =~ none ]]; then
|
|||||||
VSWITCH_CPULIST=""
|
VSWITCH_CPULIST=""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
IDLE_MARK=95.0
|
PIDFILE=/var/run/affine-tasks.sh.pid
|
||||||
KERNEL=$(uname -a)
|
|
||||||
|
# Idle cpu occupancy threshold; logical cpus with greater idle occupancy
|
||||||
|
# than this will be included.
|
||||||
|
IDLEOCC_THRESHOLD=95.0
|
||||||
|
|
||||||
|
# Watch timeout to monitor removal of flag file; this is engineered as
|
||||||
|
# 2x the typical duration of a swact.
|
||||||
|
WATCH_TIMEOUT_SECONDS=90
|
||||||
|
|
||||||
|
# Log info message to /var/log/daemon.log
|
||||||
|
NAME="task-affine-functions"
|
||||||
|
LOG_FILE=/tmp/task-affine-functions.log
|
||||||
|
function LOG {
    # Usage: LOG <message words...>
    #
    # Sends the message to syslog through logger (priority daemon.info),
    # tagged with "${NAME}($$): ". When LOG_FILE is non-empty, a timestamped
    # copy of the message is also appended to ${LOG_FILE}.
    #
    # Globals read: NAME, LOG_FILE, HOSTNAME.
    local message="$*"
    local tstamp_H

    logger -p daemon.info -t "${NAME}($$): " "$@"
    if [ -n "${LOG_FILE}" ]; then
        # %m and %d are always zero padded, giving YYYY-MM-DDTHH:MM:SS.
        tstamp_H=$(date +"%Y-%m-%dT%H:%M:%S")
        # printf '%s' writes the message verbatim; backslash sequences in
        # the text are not expanded, so the file copy matches what was
        # handed to logger.
        printf '%s %s %s(%s): info %s\n' \
            "${tstamp_H}" "${HOSTNAME}" "$0" "$$" "${message}" >> "${LOG_FILE}"
    fi
}
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
# Check if a given core is one of the platform cores
|
# Check if a given core is one of the platform cores
|
||||||
@ -72,8 +87,22 @@ function is_vswitch_core {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# Check if a given core is one of the isolcpus cores
|
||||||
|
################################################################################
|
||||||
|
function is_isolcpus_core {
    # Usage: is_isolcpus_core <logical_cpu>
    #
    # Looks up <logical_cpu> in ISOL_CPULIST, a space separated list of
    # logical cpu numbers.
    #
    # Returns 1 when the cpu IS in the list and 0 when it is not. Callers
    # in this file test the result with "[ $? -eq 1 ]", so this inverted
    # convention must be kept.
    local core=$1
    local isol_cpu

    # No cpu given: nothing can match. Bail out early instead of letting
    # the numeric test below print an operator error for every list entry.
    if [ -z "${core}" ]; then
        return 0
    fi

    # ISOL_CPULIST is deliberately unquoted so it splits into one word
    # per cpu number.
    for isol_cpu in ${ISOL_CPULIST}; do
        if [ "${core}" -eq "${isol_cpu}" ]; then
            return 1
        fi
    done
    return 0
}
|
||||||
|
|
||||||
# Return list of reaffineable pids. This includes all processes, but excludes
|
# Return list of reaffineable pids. This includes all processes, but excludes
|
||||||
# kernel threads, vSwitch, and anything in K8S or qemu/kvm.
|
# kernel threads, vSwitch, and anything in the cgroup cpusets: k8s-infra, docker,
|
||||||
|
# and machine.slice (i.e., qemu-kvm).
|
||||||
function reaffineable_pids {
|
function reaffineable_pids {
|
||||||
local pids_excl
|
local pids_excl
|
||||||
local pidlist
|
local pidlist
|
||||||
@ -83,7 +112,7 @@ function reaffineable_pids {
|
|||||||
sed 's/,$/\n/')
|
sed 's/,$/\n/')
|
||||||
pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \
|
pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \
|
||||||
-o pid=,cgroup= | \
|
-o pid=,cgroup= | \
|
||||||
awk '!/k8s-infra|machine.slice/ {print $1; }')
|
awk '!/k8s-infra|docker|machine.slice/ {print $1; }')
|
||||||
echo "${pidlist[@]}"
|
echo "${pidlist[@]}"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,7 +122,7 @@ function reaffineable_pids {
|
|||||||
# critical and cpu intensive operation in AIO. For instance, sm can leverage
|
# critical and cpu intensive operation in AIO. For instance, sm can leverage
|
||||||
# the idle cores to speed up swact activity.
|
# the idle cores to speed up swact activity.
|
||||||
#
|
#
|
||||||
# At the end of the operation, regarless of the result, the service must be
|
# At the end of the operation, regardless of the result, the service must be
|
||||||
# calling function affine_tasks_to_platform_cores to re-affine platform tasks
|
# calling function affine_tasks_to_platform_cores to re-affine platform tasks
|
||||||
# back to their assigned core(s).
|
# back to their assigned core(s).
|
||||||
#
|
#
|
||||||
@ -101,61 +130,92 @@ function reaffineable_pids {
|
|||||||
################################################################################
|
################################################################################
|
||||||
function affine_tasks_to_idle_cores {
|
function affine_tasks_to_idle_cores {
|
||||||
local cpulist
|
local cpulist
|
||||||
local cpuocc_list
|
|
||||||
local vswitch_pid
|
local vswitch_pid
|
||||||
local pidlist
|
local pidlist
|
||||||
local idle_cpulist
|
local idle_cpulist
|
||||||
local platform_cpus
|
local platform_cpus
|
||||||
|
local count=0
|
||||||
local rc=0
|
local rc=0
|
||||||
local cpu=0
|
|
||||||
|
# Keep the last invocation of affining, truncate when we use idle cores
|
||||||
|
:> ${LOG_FILE}
|
||||||
|
|
||||||
|
# Ensure this only runs on AIO
|
||||||
|
if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; }
|
||||||
|
then
|
||||||
|
LOG "Not AIO, nothing to do."
|
||||||
|
return $rc
|
||||||
|
fi
|
||||||
|
|
||||||
if [ -f ${TASK_AFFINING_INCOMPLETE} ]; then
|
if [ -f ${TASK_AFFINING_INCOMPLETE} ]; then
|
||||||
read cpulist < ${TASK_AFFINING_INCOMPLETE}
|
read cpulist < ${TASK_AFFINING_INCOMPLETE}
|
||||||
log_debug "${TAG} Tasks have already been affined to CPU ($cpulist)."
|
LOG "Tasks have already been affined to CPU ($cpulist)."
|
||||||
return 0
|
return $rc
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "${KERNEL}" == *" RT "* ]]; then
|
# Get idle cpu occupancy of all logical cores in the last 5 seconds.
|
||||||
return 0
|
declare -a cpuocc_list=( $(sar -P ALL 1 5 | grep Average | awk '{if(NR>2)print $8}') )
|
||||||
fi
|
|
||||||
|
|
||||||
# Compile a list of cpus with idle percentage greater than 95% in the last
|
# Determine logical cpus that are considered platform, or application
|
||||||
# 5 seconds.
|
# cores with idle percentage greater than 95%.
|
||||||
cpuocc_list=($(sar -P ALL 1 5|grep Average|awk '{if(NR>2)print $8}'))
|
declare -a idle_cpus=()
|
||||||
|
for cpu in ${!cpuocc_list[@]}; do
|
||||||
for idle_value in ${cpuocc_list[@]}; do
|
idleocc=${cpuocc_list[$cpu]}
|
||||||
is_vswitch_core $cpu
|
is_vswitch_core $cpu
|
||||||
if [ $? -eq 1 ]; then
|
if [ $? -eq 1 ]; then
|
||||||
cpu=$(($cpu+1))
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
is_isolcpus_core $cpu
|
||||||
|
if [ $? -eq 1 ]; then
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
is_platform_core $cpu
|
is_platform_core $cpu
|
||||||
if [ $? -eq 1 ]; then
|
if [ $? -eq 1 ]; then
|
||||||
# Platform core is added to the idle list by default
|
idle_cpus+=( ${cpu} )
|
||||||
idle_cpulist=$idle_cpulist$cpu","
|
|
||||||
else
|
else
|
||||||
# Non platform core is added to the idle list if it is more
|
if [[ $(echo "${idleocc} > ${IDLEOCC_THRESHOLD}" | bc) -eq 1 ]]; then
|
||||||
# than 95% idle
|
idle_cpus+=( ${cpu} )
|
||||||
if [[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]]; then
|
|
||||||
idle_cpulist=$idle_cpulist$cpu","
|
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
cpu=$(($cpu+1))
|
|
||||||
done
|
done
|
||||||
|
|
||||||
idle_cpulist=$(echo $idle_cpulist|sed 's/.$//')
|
# comma separated list of idle cpus
|
||||||
|
idle_cpulist=$(printf '%s,' "${idle_cpus[@]}")
|
||||||
|
idle_cpulist=${idle_cpulist%,}
|
||||||
|
|
||||||
log_debug "${TAG} Affining all tasks to idle CPU ($idle_cpulist)"
|
LOG "Affining all tasks to idle CPU ($idle_cpulist)"
|
||||||
pidlist=( $(reaffineable_pids) )
|
pidlist=( $(reaffineable_pids) )
|
||||||
for pid in ${pidlist[@]}; do
|
for pid in ${pidlist[@]}; do
|
||||||
|
count=$((${count} + 1))
|
||||||
taskset --all-tasks --pid --cpu-list \
|
taskset --all-tasks --pid --cpu-list \
|
||||||
${idle_cpulist} ${pid} > /dev/null 2>&1
|
${idle_cpulist} ${pid} > /dev/null 2>&1
|
||||||
done
|
done
|
||||||
|
|
||||||
# Save the cpu list to the temp file which will be read and removed when
|
# Save the cpu list to the temp file which will be read and removed when
|
||||||
# tasks are reaffined to the platform cores later on.
|
# tasks are reaffined to the platform cores later on.
|
||||||
|
# This list is consumed by SM so it knows about extra cores.
|
||||||
echo $idle_cpulist > ${TASK_AFFINING_INCOMPLETE}
|
echo $idle_cpulist > ${TASK_AFFINING_INCOMPLETE}
|
||||||
|
LOG "Affined ${count} processes to idle cores."
|
||||||
|
|
||||||
|
# Wait for affining flag file to disappear. If the timeout period is reached,
|
||||||
|
# affine tasks back to platform cores.
|
||||||
|
watch_start_seconds=${SECONDS}
|
||||||
|
while [ -f ${TASK_AFFINING_INCOMPLETE} ]; do
|
||||||
|
elapsed_seconds=$(( ${SECONDS} - ${watch_start_seconds} ))
|
||||||
|
LOG "Waiting for swact to complete: ${elapsed_seconds} seconds."
|
||||||
|
if [ ${elapsed_seconds} -ge ${WATCH_TIMEOUT_SECONDS} ]; then
|
||||||
|
LOG "Exceeded watch timeout: ${WATCH_TIMEOUT_SECONDS} seconds," \
|
||||||
|
"affining tasks to platform cores."
|
||||||
|
affine_tasks_to_platform_cores
|
||||||
|
LOG "Idle cores watch completed," \
|
||||||
|
"tasks reaffined to platform cores."
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
sleep 5
|
||||||
|
done
|
||||||
|
|
||||||
return $rc
|
return $rc
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -164,24 +224,36 @@ function affine_tasks_to_idle_cores {
|
|||||||
# to re-affine management tasks back to the platform cores.
|
# to re-affine management tasks back to the platform cores.
|
||||||
################################################################################
|
################################################################################
|
||||||
function affine_tasks_to_platform_cores {
|
function affine_tasks_to_platform_cores {
|
||||||
local cpulist
|
|
||||||
local pidlist
|
local pidlist
|
||||||
local rc=0
|
local rc=0
|
||||||
local count=0
|
local count=0
|
||||||
|
|
||||||
if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then
|
# Ensure this only runs on AIO
|
||||||
dbg_str="${TAG} Either tasks have never been affined to all/idle"
|
if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; }
|
||||||
dbg_str="${TAG} cores or they have already been reaffined to"
|
then
|
||||||
dbg_str="${TAG} platform cores."
|
LOG "Not AIO, nothing to do."
|
||||||
log_debug "$dbg_str"
|
return $rc
|
||||||
return 0
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
read cpulist < ${TASK_AFFINING_INCOMPLETE}
|
# Abort if affine-tasks.sh is running
|
||||||
|
if [ -e ${PIDFILE} ]; then
|
||||||
|
pid=$(cat ${PIDFILE})
|
||||||
|
if [ -n "${pid}" -a -e /proc/${pid} ]; then
|
||||||
|
LOG "Aborting, ${pid} already running: ${PIDFILE}."
|
||||||
|
return $rc
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
log_debug "${TAG} Reaffining tasks to platform cores (${PLATFORM_CPUS})..."
|
if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then
|
||||||
|
LOG "Either tasks have never been affined to all/idle cores" \
|
||||||
|
"or they have already been reaffined to platform cores."
|
||||||
|
return $rc
|
||||||
|
fi
|
||||||
|
|
||||||
|
LOG "Reaffining tasks to platform cores (${PLATFORM_CPUS})..."
|
||||||
pidlist=( $(reaffineable_pids) )
|
pidlist=( $(reaffineable_pids) )
|
||||||
for pid in ${pidlist[@]}; do
|
for pid in ${pidlist[@]}; do
|
||||||
|
count=$((${count} + 1))
|
||||||
taskset --all-tasks --pid --cpu-list \
|
taskset --all-tasks --pid --cpu-list \
|
||||||
${PLATFORM_CPUS} ${pid} > /dev/null 2>&1
|
${PLATFORM_CPUS} ${pid} > /dev/null 2>&1
|
||||||
done
|
done
|
||||||
@ -195,39 +267,42 @@ function affine_tasks_to_platform_cores {
|
|||||||
taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1
|
taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1
|
||||||
done
|
done
|
||||||
|
|
||||||
rm -rf ${TASK_AFFINING_INCOMPLETE}
|
rm -v -f ${TASK_AFFINING_INCOMPLETE}
|
||||||
|
LOG "Affined ${count} processes to platform cores."
|
||||||
|
|
||||||
return $rc
|
return $rc
|
||||||
}
|
}
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
# The following function can be leveraged by cron tasks
|
# The following function returns a single logical cpu with greatest idle
|
||||||
|
# occupancy. This can be leveraged by cron tasks or other processes.
|
||||||
|
# (e.g., python-keystone)
|
||||||
################################################################################
|
################################################################################
|
||||||
function get_most_idle_core {
|
function get_most_idle_core {
|
||||||
local cpuocc_list
|
local most_idle_value=${IDLEOCC_THRESHOLD}
|
||||||
local cpu=0
|
|
||||||
local most_idle_value=${IDLE_MARK}
|
|
||||||
local most_idle_cpu=0
|
local most_idle_cpu=0
|
||||||
|
|
||||||
if [[ "${KERNEL}" == *" RT "* ]]; then
|
declare -a cpuocc_list=( $(sar -P ALL 1 5 | grep Average | awk '{if(NR>2)print $8}') )
|
||||||
echo $cpu
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
|
|
||||||
cpuocc_list=($(sar -P ALL 1 5|grep Average|awk '{if(NR>2)print $8}'))
|
for cpu in ${!cpuocc_list[@]}; do
|
||||||
|
idle_value=${cpuocc_list[$cpu]}
|
||||||
for idle_value in ${cpuocc_list[@]}; do
|
|
||||||
is_vswitch_core $cpu
|
is_vswitch_core $cpu
|
||||||
if [ $? -eq 1 ]; then
|
if [ $? -eq 1 ]; then
|
||||||
cpu=$(($cpu+1))
|
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $(echo "$idle_value > $most_idle_value"|bc) -eq 1 ]; then
|
is_isolcpus_core $cpu
|
||||||
most_idle_value=$idle_value
|
if [ $? -eq 1 ]; then
|
||||||
most_idle_cpu=$cpu
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $(echo "${idle_value} > ${most_idle_value}" | bc) -eq 1 ]; then
|
||||||
|
most_idle_value=${idle_value}
|
||||||
|
most_idle_cpu=${cpu}
|
||||||
fi
|
fi
|
||||||
cpu=$(($cpu+1))
|
|
||||||
done
|
done
|
||||||
|
|
||||||
echo $most_idle_cpu
|
LOG "get_most_idle_core: cpu=$most_idle_cpu, idleocc=$most_idle_value"
|
||||||
|
echo ${most_idle_cpu}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user