Merge "AIO-DX swact task affinity robustness"

This commit is contained in:
Zuul 2021-05-20 01:42:04 +00:00 committed by Gerrit Code Review
commit 931887828b
2 changed files with 150 additions and 64 deletions

13
utilities/worker-utils/worker-utils/affine-tasks.sh Normal file → Executable file
View File

@ -68,6 +68,8 @@ LNAME=$(readlink -n -f $0)
NAME=$(basename $LNAME) NAME=$(basename $LNAME)
PIDFILE=/var/run/${NAME}.pid PIDFILE=/var/run/${NAME}.pid
TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete"
# Define number of logical cpus # Define number of logical cpus
LOGICAL_CPUS=$(getconf _NPROCESSORS_ONLN) LOGICAL_CPUS=$(getconf _NPROCESSORS_ONLN)
@ -89,6 +91,11 @@ else
NONISOL_CPUS=${ONLINE_CPUS} NONISOL_CPUS=${ONLINE_CPUS}
NONISOL_MASK=${ONLINE_MASK} NONISOL_MASK=${ONLINE_MASK}
fi fi
# NONISOL_CPULIST is a space separated list, consumed by SM so that
# it knows about extra available cores
NONISOL_CPULIST=$(echo ${NONISOL_CPUS} | \
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
sed 's/,/ /g')
# Define platform memory nodeset and cpuset # Define platform memory nodeset and cpuset
PLATFORM_NODES=$(cat /sys/devices/system/node/online) PLATFORM_NODES=$(cat /sys/devices/system/node/online)
@ -404,7 +411,8 @@ function affine_drbd_tasks {
} }
# Return list of reaffineable pids. This includes all processes, but excludes # Return list of reaffineable pids. This includes all processes, but excludes
# kernel threads, vSwitch, and anything in K8S, docker or qemu/kvm cpuset. # kernel threads, vSwitch, and anything in the cgroup cpusets: k8s-infra, docker,
# and machine.slice (i.e., qemu-kvm).
function reaffineable_pids { function reaffineable_pids {
local pids_excl local pids_excl
local pidlist local pidlist
@ -433,6 +441,8 @@ function affine_tasks_to_all_cores {
${NONISOL_CPUS} ${pid} > /dev/null 2>&1 ${NONISOL_CPUS} ${pid} > /dev/null 2>&1
done done
echo ${NONISOL_CPULIST} > ${TASK_AFFINING_INCOMPLETE}
LOG "Affined ${count} processes to all cores." LOG "Affined ${count} processes to all cores."
} }
@ -472,6 +482,7 @@ function affine_tasks_to_platform_cores {
taskset --pid --cpu-list 0 ${pid} > /dev/null 2>&1 taskset --pid --cpu-list 0 ${pid} > /dev/null 2>&1
done done
rm -v -f ${TASK_AFFINING_INCOMPLETE}
LOG "Affined ${count} processes to platform cores." LOG "Affined ${count} processes to platform cores."
} }

View File

@ -19,19 +19,16 @@
PATH=/bin:/usr/bin:/usr/local/bin PATH=/bin:/usr/bin:/usr/local/bin
. /etc/platform/platform.conf . /etc/platform/platform.conf
LOG_FUNCTIONS=${LOG_FUNCTIONS:-"/etc/init.d/log_functions.sh"}
CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"} CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"}
[[ -e ${LOG_FUNCTIONS} ]] && source ${LOG_FUNCTIONS}
[[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS} [[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS}
# Enable debug logs and tag them
LOG_DEBUG=1
TAG="TASKAFFINITY:"
TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete" TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete"
N_CPUS=$(getconf _NPROCESSORS_ONLN)
FULLSET_CPUS="0-"$((N_CPUS-1)) # The following CPULISTs are space separated lists of logical cpus,
FULLSET_MASK=$(cpulist_to_cpumap ${FULLSET_CPUS} ${N_CPUS}) # and are used by helper functions.
ISOL_CPULIST=$(/bin/cat /sys/devices/system/cpu/isolated | \
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
sed 's/,/ /g')
PLATFORM_CPUS=$(platform_expanded_cpu_list) PLATFORM_CPUS=$(platform_expanded_cpu_list)
PLATFORM_CPULIST=$(platform_expanded_cpu_list| \ PLATFORM_CPULIST=$(platform_expanded_cpu_list| \
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \ perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
@ -43,8 +40,26 @@ if [[ $vswitch_type =~ none ]]; then
VSWITCH_CPULIST="" VSWITCH_CPULIST=""
fi fi
IDLE_MARK=95.0 PIDFILE=/var/run/affine-tasks.sh.pid
KERNEL=$(uname -a)
# Idle cpu occupancy threshold; logical cpus with greater idle occupancy
# than this will be included.
IDLEOCC_THRESHOLD=95.0
# Watch timeout to monitor removal of flag file; this is engineered as
# 2x the typical duration of a swact.
WATCH_TIMEOUT_SECONDS=90
# Log info message to /var/log/daemon.log
NAME="task-affine-functions"
LOG_FILE=/tmp/task-affine-functions.log
# Log an informational message.
# Globals:   NAME (read)      - tag used for the syslog entry
#            LOG_FILE (read)  - optional file that also receives the message;
#                               skipped when empty or unset
#            HOSTNAME (read)
# Arguments: $@ - words of the message
# Outputs:   one daemon.info syslog entry via logger and, when LOG_FILE is
#            set, one timestamped line appended to that file
function LOG {
    logger -p daemon.info -t "${NAME}($$): " "$@"
    if [[ -n "${LOG_FILE}" ]]; then
        # Declare separately so 'local' does not mask the status of date.
        local tstamp_H
        tstamp_H=$( date +"%Y-%0m-%0eT%H:%M:%S" )
        # "$*" joins the message words into one string; the redirect target
        # is quoted so a path containing whitespace is still a single file.
        echo -e "${tstamp_H} ${HOSTNAME} $0($$): info $*" >> "${LOG_FILE}"
    fi
}
################################################################################ ################################################################################
# Check if a given core is one of the platform cores # Check if a given core is one of the platform cores
@ -72,8 +87,22 @@ function is_vswitch_core {
return 0 return 0
} }
################################################################################
# Check if a given core is one of the isolcpus cores
# Globals:   ISOL_CPULIST (read) - space separated list of logical cpus
# Arguments: $1 - logical cpu number to look up
# Returns:   1 when the core is found in ISOL_CPULIST, 0 otherwise
#            (callers test the result with: [ $? -eq 1 ])
################################################################################
function is_isolcpus_core {
    local core=$1
    # Keep the loop variable local so the caller's globals are not clobbered.
    local isol_cpu
    # ISOL_CPULIST is intentionally unquoted: it is split on whitespace.
    for isol_cpu in ${ISOL_CPULIST}; do
        if [ "${core}" -eq "${isol_cpu}" ]; then
            return 1
        fi
    done
    return 0
}
# Return list of reaffineable pids. This includes all processes, but excludes # Return list of reaffineable pids. This includes all processes, but excludes
# kernel threads, vSwitch, and anything in K8S or qemu/kvm. # kernel threads, vSwitch, and anything in the cgroup cpusets: k8s-infra, docker,
# and machine.slice (i.e., qemu-kvm).
function reaffineable_pids { function reaffineable_pids {
local pids_excl local pids_excl
local pidlist local pidlist
@ -83,7 +112,7 @@ function reaffineable_pids {
sed 's/,$/\n/') sed 's/,$/\n/')
pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \ pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \
-o pid=,cgroup= | \ -o pid=,cgroup= | \
awk '!/k8s-infra|machine.slice/ {print $1; }') awk '!/k8s-infra|docker|machine.slice/ {print $1; }')
echo "${pidlist[@]}" echo "${pidlist[@]}"
} }
@ -93,7 +122,7 @@ function reaffineable_pids {
# critical and cpu intensive operation in AIO. For instance, sm can leverage # critical and cpu intensive operation in AIO. For instance, sm can leverage
# the idle cores to speed up swact activity. # the idle cores to speed up swact activity.
# #
# At the end of the operation, regarless of the result, the service must be # At the end of the operation, regardless of the result, the service must be
# calling function affine_tasks_to_platform_cores to re-affine platform tasks # calling function affine_tasks_to_platform_cores to re-affine platform tasks
# back to their assigned core(s). # back to their assigned core(s).
# #
@ -101,61 +130,92 @@ function reaffineable_pids {
################################################################################ ################################################################################
function affine_tasks_to_idle_cores { function affine_tasks_to_idle_cores {
local cpulist local cpulist
local cpuocc_list
local vswitch_pid local vswitch_pid
local pidlist local pidlist
local idle_cpulist local idle_cpulist
local platform_cpus local platform_cpus
local count=0
local rc=0 local rc=0
local cpu=0
# Keep the last invocation of affining, truncate when we use idle cores
:> ${LOG_FILE}
# Ensure this only runs on AIO
if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; }
then
LOG "Not AIO, nothing to do."
return $rc
fi
if [ -f ${TASK_AFFINING_INCOMPLETE} ]; then if [ -f ${TASK_AFFINING_INCOMPLETE} ]; then
read cpulist < ${TASK_AFFINING_INCOMPLETE} read cpulist < ${TASK_AFFINING_INCOMPLETE}
log_debug "${TAG} Tasks have already been affined to CPU ($cpulist)." LOG "Tasks have already been affined to CPU ($cpulist)."
return 0 return $rc
fi fi
if [[ "${KERNEL}" == *" RT "* ]]; then # Get idle cpu occupancy of all logical cores in the last 5 seconds.
return 0 declare -a cpuocc_list=( $(sar -P ALL 1 5 | grep Average | awk '{if(NR>2)print $8}') )
fi
# Compile a list of cpus with idle percentage greater than 95% in the last # Determine logical cpus that are considered platform, or application
# 5 seconds. # cores with idle percentage greater than 95%.
cpuocc_list=($(sar -P ALL 1 5|grep Average|awk '{if(NR>2)print $8}')) declare -a idle_cpus=()
for cpu in ${!cpuocc_list[@]}; do
for idle_value in ${cpuocc_list[@]}; do idleocc=${cpuocc_list[$cpu]}
is_vswitch_core $cpu is_vswitch_core $cpu
if [ $? -eq 1 ]; then if [ $? -eq 1 ]; then
cpu=$(($cpu+1)) continue
fi
is_isolcpus_core $cpu
if [ $? -eq 1 ]; then
continue continue
fi fi
is_platform_core $cpu is_platform_core $cpu
if [ $? -eq 1 ]; then if [ $? -eq 1 ]; then
# Platform core is added to the idle list by default idle_cpus+=( ${cpu} )
idle_cpulist=$idle_cpulist$cpu","
else else
# Non platform core is added to the idle list if it is more if [[ $(echo "${idleocc} > ${IDLEOCC_THRESHOLD}" | bc) -eq 1 ]]; then
# than 95% idle idle_cpus+=( ${cpu} )
if [[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]]; then
idle_cpulist=$idle_cpulist$cpu","
fi fi
fi fi
cpu=$(($cpu+1))
done done
idle_cpulist=$(echo $idle_cpulist|sed 's/.$//') # comma separated list of idle cpus
idle_cpulist=$(printf '%s,' "${idle_cpus[@]}")
idle_cpulist=${idle_cpulist%,}
log_debug "${TAG} Affining all tasks to idle CPU ($idle_cpulist)" LOG "Affining all tasks to idle CPU ($idle_cpulist)"
pidlist=( $(reaffineable_pids) ) pidlist=( $(reaffineable_pids) )
for pid in ${pidlist[@]}; do for pid in ${pidlist[@]}; do
count=$((${count} + 1))
taskset --all-tasks --pid --cpu-list \ taskset --all-tasks --pid --cpu-list \
${idle_cpulist} ${pid} > /dev/null 2>&1 ${idle_cpulist} ${pid} > /dev/null 2>&1
done done
# Save the cpu list to the temp file which will be read and removed when # Save the cpu list to the temp file which will be read and removed when
# tasks are reaffined to the platform cores later on. # tasks are reaffined to the platform cores later on.
# This list is consumed by SM so it knows about extra cores.
echo $idle_cpulist > ${TASK_AFFINING_INCOMPLETE} echo $idle_cpulist > ${TASK_AFFINING_INCOMPLETE}
LOG "Affined ${count} processes to idle cores."
# Wait for affining flag file to disappear. If the timeout period is reached,
# affine tasks back to platform cores.
watch_start_seconds=${SECONDS}
while [ -f ${TASK_AFFINING_INCOMPLETE} ]; do
elapsed_seconds=$(( ${SECONDS} - ${watch_start_seconds} ))
LOG "Waiting for swact to complete: ${elapsed_seconds} seconds."
if [ ${elapsed_seconds} -ge ${WATCH_TIMEOUT_SECONDS} ]; then
LOG "Exceeded watch timeout: ${WATCH_TIMEOUT_SECONDS} seconds," \
"affining tasks to platform cores."
affine_tasks_to_platform_cores
LOG "Idle cores watch completed," \
"tasks reaffined to platform cores."
break
fi
sleep 5
done
return $rc return $rc
} }
@ -164,24 +224,36 @@ function affine_tasks_to_idle_cores {
# to re-affine management tasks back to the platform cores. # to re-affine management tasks back to the platform cores.
################################################################################ ################################################################################
function affine_tasks_to_platform_cores { function affine_tasks_to_platform_cores {
local cpulist
local pidlist local pidlist
local rc=0 local rc=0
local count=0 local count=0
if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then # Ensure this only runs on AIO
dbg_str="${TAG} Either tasks have never been affined to all/idle" if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; }
dbg_str="${TAG} cores or they have already been reaffined to" then
dbg_str="${TAG} platform cores." LOG "Not AIO, nothing to do."
log_debug "$dbg_str" return $rc
return 0
fi fi
read cpulist < ${TASK_AFFINING_INCOMPLETE} # Abort if affine-tasks.sh is running
if [ -e ${PIDFILE} ]; then
pid=$(cat ${PIDFILE})
if [ -n "${pid}" -a -e /proc/${pid} ]; then
LOG "Aborting, ${pid} already running: ${PIDFILE}."
return $rc
fi
fi
log_debug "${TAG} Reaffining tasks to platform cores (${PLATFORM_CPUS})..." if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then
LOG "Either tasks have never been affined to all/idle cores" \
"or they have already been reaffined to platform cores."
return $rc
fi
LOG "Reaffining tasks to platform cores (${PLATFORM_CPUS})..."
pidlist=( $(reaffineable_pids) ) pidlist=( $(reaffineable_pids) )
for pid in ${pidlist[@]}; do for pid in ${pidlist[@]}; do
count=$((${count} + 1))
taskset --all-tasks --pid --cpu-list \ taskset --all-tasks --pid --cpu-list \
${PLATFORM_CPUS} ${pid} > /dev/null 2>&1 ${PLATFORM_CPUS} ${pid} > /dev/null 2>&1
done done
@ -195,39 +267,42 @@ function affine_tasks_to_platform_cores {
taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1 taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1
done done
rm -rf ${TASK_AFFINING_INCOMPLETE} rm -v -f ${TASK_AFFINING_INCOMPLETE}
LOG "Affined ${count} processes to platform cores."
return $rc return $rc
} }
################################################################################ ################################################################################
# The following function can be leveraged by cron tasks # The following function returns a single logical cpu with greatest idle
# occupancy. This can be leveraged by cron tasks or other processes.
# (e.g., python-keystone)
################################################################################ ################################################################################
function get_most_idle_core { function get_most_idle_core {
local cpuocc_list local most_idle_value=${IDLEOCC_THRESHOLD}
local cpu=0
local most_idle_value=${IDLE_MARK}
local most_idle_cpu=0 local most_idle_cpu=0
if [[ "${KERNEL}" == *" RT "* ]]; then declare -a cpuocc_list=( $(sar -P ALL 1 5 | grep Average | awk '{if(NR>2)print $8}') )
echo $cpu
return
fi
cpuocc_list=($(sar -P ALL 1 5|grep Average|awk '{if(NR>2)print $8}')) for cpu in ${!cpuocc_list[@]}; do
idle_value=${cpuocc_list[$cpu]}
for idle_value in ${cpuocc_list[@]}; do
is_vswitch_core $cpu is_vswitch_core $cpu
if [ $? -eq 1 ]; then if [ $? -eq 1 ]; then
cpu=$(($cpu+1))
continue continue
fi fi
if [ $(echo "$idle_value > $most_idle_value"|bc) -eq 1 ]; then is_isolcpus_core $cpu
most_idle_value=$idle_value if [ $? -eq 1 ]; then
most_idle_cpu=$cpu continue
fi
if [ $(echo "${idle_value} > ${most_idle_value}" | bc) -eq 1 ]; then
most_idle_value=${idle_value}
most_idle_cpu=${cpu}
fi fi
cpu=$(($cpu+1))
done done
echo $most_idle_cpu LOG "get_most_idle_core: cpu=$most_idle_cpu, idleocc=$most_idle_value"
echo ${most_idle_cpu}
} }