From bb2932c067c410d6d9f22e1638cf5484f55cdfb1 Mon Sep 17 00:00:00 2001 From: Jim Gauld Date: Tue, 18 May 2021 13:48:53 -0400 Subject: [PATCH] AIO-DX swact task affinity robustness Task affinity functions are used to speed up initialization of AIO and swact on AIO-DX. When swact occurs, SM leverages task-affining scripts to move platform tasks to idle cores, followed by moving platform tasks back to platform cores at the end of the swact. This change adds a timeout of 90 seconds so that tasks are always affined back to platform cores even if the swact does not complete (e.g., due to a failed or disabled minor service). This also corrects interactions of the task_affinity_functions with the affine-tasks.sh init script by checking if the service is running, and by updating/removing a common flag file. This will also improve the task affinity handling of OpenStack installation and startup, since the affine-tasks.sh script assumes tasks float across cores until nova-compute is providing service. 
Closes-Bug: 1928836 Signed-off-by: Jim Gauld Change-Id: Ief5c65103f98e9ffb57f96327af1e0dd35d13857 --- .../worker-utils/worker-utils/affine-tasks.sh | 13 +- .../worker-utils/task_affinity_functions.sh | 201 ++++++++++++------ 2 files changed, 150 insertions(+), 64 deletions(-) mode change 100644 => 100755 utilities/worker-utils/worker-utils/affine-tasks.sh diff --git a/utilities/worker-utils/worker-utils/affine-tasks.sh b/utilities/worker-utils/worker-utils/affine-tasks.sh old mode 100644 new mode 100755 index 3f6d294e..6d851343 --- a/utilities/worker-utils/worker-utils/affine-tasks.sh +++ b/utilities/worker-utils/worker-utils/affine-tasks.sh @@ -68,6 +68,8 @@ LNAME=$(readlink -n -f $0) NAME=$(basename $LNAME) PIDFILE=/var/run/${NAME}.pid +TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete" + # Define number of logical cpus LOGICAL_CPUS=$(getconf _NPROCESSORS_ONLN) @@ -89,6 +91,11 @@ else NONISOL_CPUS=${ONLINE_CPUS} NONISOL_MASK=${ONLINE_MASK} fi +# NONISOL_CPULIST is a space separated list, consumed by SM so that +# it knows about extra available cores +NONISOL_CPULIST=$(echo ${NONISOL_CPUS} | \ + perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \ + sed 's/,/ /g') # Define platform memory nodeset and cpuset PLATFORM_NODES=$(cat /sys/devices/system/node/online) @@ -404,7 +411,8 @@ function affine_drbd_tasks { } # Return list of reaffineable pids. This includes all processes, but excludes -# kernel threads, vSwitch, and anything in K8S, docker or qemu/kvm cpuset. +# kernel threads, vSwitch, and anything in the cgroup cpusets: k8s-infra, docker, +# and machine.slice (i.e., qemu-kvm). function reaffineable_pids { local pids_excl local pidlist @@ -433,6 +441,8 @@ function affine_tasks_to_all_cores { ${NONISOL_CPUS} ${pid} > /dev/null 2>&1 done + + echo ${NONISOL_CPULIST} > ${TASK_AFFINING_INCOMPLETE} LOG "Affined ${count} processes to all cores." 
} @@ -472,6 +482,7 @@ function affine_tasks_to_platform_cores { taskset --pid --cpu-list 0 ${pid} > /dev/null 2>&1 done + rm -v -f ${TASK_AFFINING_INCOMPLETE} LOG "Affined ${count} processes to platform cores." } diff --git a/utilities/worker-utils/worker-utils/task_affinity_functions.sh b/utilities/worker-utils/worker-utils/task_affinity_functions.sh index 4b184310..a6436e55 100755 --- a/utilities/worker-utils/worker-utils/task_affinity_functions.sh +++ b/utilities/worker-utils/worker-utils/task_affinity_functions.sh @@ -19,19 +19,16 @@ PATH=/bin:/usr/bin:/usr/local/bin . /etc/platform/platform.conf -LOG_FUNCTIONS=${LOG_FUNCTIONS:-"/etc/init.d/log_functions.sh"} CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"} -[[ -e ${LOG_FUNCTIONS} ]] && source ${LOG_FUNCTIONS} [[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS} -# Enable debug logs and tag them -LOG_DEBUG=1 -TAG="TASKAFFINITY:" - TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete" -N_CPUS=$(getconf _NPROCESSORS_ONLN) -FULLSET_CPUS="0-"$((N_CPUS-1)) -FULLSET_MASK=$(cpulist_to_cpumap ${FULLSET_CPUS} ${N_CPUS}) + +# The following CPULISTs are space separated lists of logical cpus, +# and are used by helper functions. +ISOL_CPULIST=$(/bin/cat /sys/devices/system/cpu/isolated | \ + perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \ + sed 's/,/ /g') PLATFORM_CPUS=$(platform_expanded_cpu_list) PLATFORM_CPULIST=$(platform_expanded_cpu_list| \ perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \ @@ -43,8 +40,26 @@ if [[ $vswitch_type =~ none ]]; then VSWITCH_CPULIST="" fi -IDLE_MARK=95.0 -KERNEL=$(uname -a) +PIDFILE=/var/run/affine-tasks.sh.pid + +# Idle cpu occupancy threshold; logical cpus with greater idle occupancy +# than this will be included. +IDLEOCC_THRESHOLD=95.0 + +# Watch timeout to monitor removal of flag file; this is engineered as +# 2x the typical duration of a swact. 
+WATCH_TIMEOUT_SECONDS=90 + +# Log info message to /var/log/daemon.log +NAME="task-affine-functions" +LOG_FILE=/tmp/task-affine-functions.log +function LOG { + logger -p daemon.info -t "${NAME}($$): " "$@" + if [ ! -z "${LOG_FILE}" ]; then + local tstamp_H=$( date +"%Y-%0m-%0eT%H:%M:%S" ) + echo -e "${tstamp_H} ${HOSTNAME} $0($$): info $@" >> ${LOG_FILE} + fi +} ################################################################################ # Check if a given core is one of the platform cores @@ -72,8 +87,22 @@ function is_vswitch_core { return 0 } +################################################################################ +# Check if a given core is one of the isolcpus cores +################################################################################ +function is_isolcpus_core { + local core=$1 + for CPU in ${ISOL_CPULIST}; do + if [ $core -eq $CPU ]; then + return 1 + fi + done + return 0 +} + # Return list of reaffineable pids. This includes all processes, but excludes -# kernel threads, vSwitch, and anything in K8S or qemu/kvm. +# kernel threads, vSwitch, and anything in the cgroup cpusets: k8s-infra, docker, +# and machine.slice (i.e., qemu-kvm). function reaffineable_pids { local pids_excl local pidlist @@ -83,7 +112,7 @@ function reaffineable_pids { sed 's/,$/\n/') pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \ -o pid=,cgroup= | \ - awk '!/k8s-infra|machine.slice/ {print $1; }') + awk '!/k8s-infra|docker|machine.slice/ {print $1; }') echo "${pidlist[@]}" } @@ -93,7 +122,7 @@ function reaffineable_pids { # critical and cpu intensive operation in AIO. For instance, sm can levearage # the idle cores to speed up swact activity. # -# At the end of the operation, regarless of the result, the service must be +# At the end of the operation, regardless of the result, the service must be # calling function affine_tasks_to_platform_cores to re-affine platform tasks # back to their assigned core(s). 
# @@ -101,61 +130,92 @@ function reaffineable_pids { ################################################################################ function affine_tasks_to_idle_cores { local cpulist - local cpuocc_list local vswitch_pid local pidlist local idle_cpulist local platform_cpus + local count=0 local rc=0 - local cpu=0 + + # Keep the last invocation of affining, truncate when we use idle cores + :> ${LOG_FILE} + + # Ensure this only runs on AIO + if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; } + then + LOG "Not AIO, nothing to do." + return $rc + fi if [ -f ${TASK_AFFINING_INCOMPLETE} ]; then read cpulist < ${TASK_AFFINING_INCOMPLETE} - log_debug "${TAG} Tasks have already been affined to CPU ($cpulist)." - return 0 + LOG "Tasks have already been affined to CPU ($cpulist)." + return $rc fi - if [[ "${KERNEL}" == *" RT "* ]]; then - return 0 - fi + # Get idle cpu occupancy of all logical cores in the last 5 seconds. + declare -a cpuocc_list=( $(sar -P ALL 1 5 | grep Average | awk '{if(NR>2)print $8}') ) - # Compile a list of cpus with idle percentage greater than 95% in the last - # 5 seconds. - cpuocc_list=($(sar -P ALL 1 5|grep Average|awk '{if(NR>2)print $8}')) - - for idle_value in ${cpuocc_list[@]}; do + # Determine logical cpus that are considered platform, or application + # cores with idle percentage greater than 95%. + declare -a idle_cpus=() + for cpu in ${!cpuocc_list[@]}; do + idleocc=${cpuocc_list[$cpu]} is_vswitch_core $cpu if [ $? -eq 1 ]; then - cpu=$(($cpu+1)) + continue + fi + + is_isolcpus_core $cpu + if [ $? -eq 1 ]; then continue fi is_platform_core $cpu if [ $? 
-eq 1 ]; then - # Platform core is added to the idle list by default - idle_cpulist=$idle_cpulist$cpu"," + idle_cpus+=( ${cpu} ) else - # Non platform core is added to the idle list if it is more - # than 95% idle - if [[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]]; then - idle_cpulist=$idle_cpulist$cpu"," + if [[ $(echo "${idleocc} > ${IDLEOCC_THRESHOLD}" | bc) -eq 1 ]]; then + idle_cpus+=( ${cpu} ) fi fi - cpu=$(($cpu+1)) done - idle_cpulist=$(echo $idle_cpulist|sed 's/.$//') + # comma separated list of idle cpus + idle_cpulist=$(printf '%s,' "${idle_cpus[@]}") + idle_cpulist=${idle_cpulist%,} - log_debug "${TAG} Affining all tasks to idle CPU ($idle_cpulist)" + LOG "Affining all tasks to idle CPU ($idle_cpulist)" pidlist=( $(reaffineable_pids) ) for pid in ${pidlist[@]}; do + count=$((${count} + 1)) taskset --all-tasks --pid --cpu-list \ ${idle_cpulist} ${pid} > /dev/null 2>&1 done # Save the cpu list to the temp file which will be read and removed when # tasks are reaffined to the platform cores later on. + # This list is consumed by SM so it knows about extra cores. echo $idle_cpulist > ${TASK_AFFINING_INCOMPLETE} + LOG "Affined ${count} processes to idle cores." + + # Wait for affining flag file to disappear. If the timeout period is reached, + # affine tasks back to platform cores. + watch_start_seconds=${SECONDS} + while [ -f ${TASK_AFFINING_INCOMPLETE} ]; do + elapsed_seconds=$(( ${SECONDS} - ${watch_start_seconds} )) + LOG "Waiting for swact to complete: ${elapsed_seconds} seconds." + if [ ${elapsed_seconds} -ge ${WATCH_TIMEOUT_SECONDS} ]; then + LOG "Exceeded watch timeout: ${WATCH_TIMEOUT_SECONDS} seconds," \ + "affining tasks to platform cores." + affine_tasks_to_platform_cores + LOG "Idle cores watch completed," \ + "tasks reaffined to platform cores." + break + fi + sleep 5 + done + return $rc } @@ -164,24 +224,36 @@ function affine_tasks_to_idle_cores { # to re-affine management tasks back to the platform cores. 
################################################################################ function affine_tasks_to_platform_cores { - local cpulist local pidlist local rc=0 local count=0 - if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then - dbg_str="${TAG} Either tasks have never been affined to all/idle" - dbg_str="${TAG} cores or they have already been reaffined to" - dbg_str="${TAG} platform cores." - log_debug "$dbg_str" - return 0 + # Ensure this only runs on AIO + if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; } + then + LOG "Not AIO, nothing to do." + return $rc fi - read cpulist < ${TASK_AFFINING_INCOMPLETE} + # Abort if affine-tasks.sh is running + if [ -e ${PIDFILE} ]; then + pid=$(cat ${PIDFILE}) + if [ -n "${pid}" -a -e /proc/${pid} ]; then + LOG "Aborting, ${pid} already running: ${PIDFILE}." + return $rc + fi + fi - log_debug "${TAG} Reaffining tasks to platform cores (${PLATFORM_CPUS})..." + if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then + LOG "Either tasks have never been affined to all/idle cores" \ + "or they have already been reaffined to platform cores." + return $rc + fi + + LOG "Reaffining tasks to platform cores (${PLATFORM_CPUS})..." pidlist=( $(reaffineable_pids) ) for pid in ${pidlist[@]}; do + count=$((${count} + 1)) taskset --all-tasks --pid --cpu-list \ ${PLATFORM_CPUS} ${pid} > /dev/null 2>&1 done @@ -195,39 +267,42 @@ function affine_tasks_to_platform_cores { taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1 done - rm -rf ${TASK_AFFINING_INCOMPLETE} + rm -v -f ${TASK_AFFINING_INCOMPLETE} + LOG "Affined ${count} processes to platform cores." + return $rc } ################################################################################ -# The following function can be leveraged by cron tasks +# The following function returns a single logical cpu with greatest idle +# occupancy. This can be leveraged by cron tasks or other processes. 
+# (e.g., python-keystone) ################################################################################ function get_most_idle_core { - local cpuocc_list - local cpu=0 - local most_idle_value=${IDLE_MARK} + local most_idle_value=${IDLEOCC_THRESHOLD} local most_idle_cpu=0 - if [[ "${KERNEL}" == *" RT "* ]]; then - echo $cpu - return - fi + declare -a cpuocc_list=( $(sar -P ALL 1 5 | grep Average | awk '{if(NR>2)print $8}') ) - cpuocc_list=($(sar -P ALL 1 5|grep Average|awk '{if(NR>2)print $8}')) - - for idle_value in ${cpuocc_list[@]}; do + for cpu in ${!cpuocc_list[@]}; do + idle_value=${cpuocc_list[$cpu]} is_vswitch_core $cpu if [ $? -eq 1 ]; then - cpu=$(($cpu+1)) continue fi - if [ $(echo "$idle_value > $most_idle_value"|bc) -eq 1 ]; then - most_idle_value=$idle_value - most_idle_cpu=$cpu + is_isolcpus_core $cpu + if [ $? -eq 1 ]; then + continue + fi + + if [ $(echo "${idle_value} > ${most_idle_value}" | bc) -eq 1 ]; then + most_idle_value=${idle_value} + most_idle_cpu=${cpu} fi - cpu=$(($cpu+1)) done - echo $most_idle_cpu + LOG "get_most_idle_core: cpu=$most_idle_cpu, idleocc=$most_idle_value" + echo ${most_idle_cpu} } +