diff --git a/utilities/worker-utils/worker-utils/affine-tasks.sh b/utilities/worker-utils/worker-utils/affine-tasks.sh old mode 100644 new mode 100755 index 3f6d294e..6d851343 --- a/utilities/worker-utils/worker-utils/affine-tasks.sh +++ b/utilities/worker-utils/worker-utils/affine-tasks.sh @@ -68,6 +68,8 @@ LNAME=$(readlink -n -f $0) NAME=$(basename $LNAME) PIDFILE=/var/run/${NAME}.pid +TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete" + # Define number of logical cpus LOGICAL_CPUS=$(getconf _NPROCESSORS_ONLN) @@ -89,6 +91,11 @@ else NONISOL_CPUS=${ONLINE_CPUS} NONISOL_MASK=${ONLINE_MASK} fi +# NONISOL_CPULIST is a space separated list, consumed by SM so that +# it knows about extra available cores +NONISOL_CPULIST=$(echo ${NONISOL_CPUS} | \ + perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \ + sed 's/,/ /g') # Define platform memory nodeset and cpuset PLATFORM_NODES=$(cat /sys/devices/system/node/online) @@ -404,7 +411,8 @@ function affine_drbd_tasks { } # Return list of reaffineable pids. This includes all processes, but excludes -# kernel threads, vSwitch, and anything in K8S, docker or qemu/kvm cpuset. +# kernel threads, vSwitch, and anything in the cgroup cpusets: k8s-infra, docker, +# and machine.slice (i.e., qemu-kvm). function reaffineable_pids { local pids_excl local pidlist @@ -433,6 +441,8 @@ function affine_tasks_to_all_cores { ${NONISOL_CPUS} ${pid} > /dev/null 2>&1 done + + echo ${NONISOL_CPULIST} > ${TASK_AFFINING_INCOMPLETE} LOG "Affined ${count} processes to all cores." } @@ -472,6 +482,7 @@ function affine_tasks_to_platform_cores { taskset --pid --cpu-list 0 ${pid} > /dev/null 2>&1 done + rm -v -f ${TASK_AFFINING_INCOMPLETE} LOG "Affined ${count} processes to platform cores." } diff --git a/utilities/worker-utils/worker-utils/task_affinity_functions.sh b/utilities/worker-utils/worker-utils/task_affinity_functions.sh index 4b184310..a6436e55 100755 --- a/utilities/worker-utils/worker-utils/task_affinity_functions.sh +++ b/utilities/worker-utils/worker-utils/task_affinity_functions.sh @@ -19,19 +19,16 @@ PATH=/bin:/usr/bin:/usr/local/bin . /etc/platform/platform.conf -LOG_FUNCTIONS=${LOG_FUNCTIONS:-"/etc/init.d/log_functions.sh"} CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"} -[[ -e ${LOG_FUNCTIONS} ]] && source ${LOG_FUNCTIONS} [[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS} -# Enable debug logs and tag them -LOG_DEBUG=1 -TAG="TASKAFFINITY:" - TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete" -N_CPUS=$(getconf _NPROCESSORS_ONLN) -FULLSET_CPUS="0-"$((N_CPUS-1)) -FULLSET_MASK=$(cpulist_to_cpumap ${FULLSET_CPUS} ${N_CPUS}) + +# The following CPULISTs are space separated lists of logical cpus, +# and are used by helper functions. +ISOL_CPULIST=$(/bin/cat /sys/devices/system/cpu/isolated | \ + perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \ + sed 's/,/ /g') PLATFORM_CPUS=$(platform_expanded_cpu_list) PLATFORM_CPULIST=$(platform_expanded_cpu_list| \ perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \ @@ -43,8 +40,26 @@ if [[ $vswitch_type =~ none ]]; then VSWITCH_CPULIST="" fi -IDLE_MARK=95.0 -KERNEL=$(uname -a) +PIDFILE=/var/run/affine-tasks.sh.pid + +# Idle cpu occupancy threshold; logical cpus with greater idle occupancy +# than this will be included. +IDLEOCC_THRESHOLD=95.0 + +# Watch timeout to monitor removal of flag file; this is engineered as +# 2x the typical duration of a swact. +WATCH_TIMEOUT_SECONDS=90 + +# Log info message to /var/log/daemon.log +NAME="task-affine-functions" +LOG_FILE=/tmp/task-affine-functions.log +function LOG { + logger -p daemon.info -t "${NAME}($$): " "$@" + if [ ! -z "${LOG_FILE}" ]; then + local tstamp_H=$( date +"%Y-%0m-%0eT%H:%M:%S" ) + echo -e "${tstamp_H} ${HOSTNAME} $0($$): info $@" >> ${LOG_FILE} + fi +} ################################################################################ # Check if a given core is one of the platform cores @@ -72,8 +87,22 @@ function is_vswitch_core { return 0 } +################################################################################ +# Check if a given core is one of the isolcpus cores +################################################################################ +function is_isolcpus_core { + local core=$1 + for CPU in ${ISOL_CPULIST}; do + if [ $core -eq $CPU ]; then + return 1 + fi + done + return 0 +} + # Return list of reaffineable pids. This includes all processes, but excludes -# kernel threads, vSwitch, and anything in K8S or qemu/kvm. +# kernel threads, vSwitch, and anything in the cgroup cpusets: k8s-infra, docker, +# and machine.slice (i.e., qemu-kvm). function reaffineable_pids { local pids_excl local pidlist @@ -83,7 +112,7 @@ function reaffineable_pids { sed 's/,$/\n/') pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \ -o pid=,cgroup= | \ - awk '!/k8s-infra|machine.slice/ {print $1; }') + awk '!/k8s-infra|docker|machine.slice/ {print $1; }') echo "${pidlist[@]}" } @@ -93,7 +122,7 @@ function reaffineable_pids { # critical and cpu intensive operation in AIO. For instance, sm can levearage # the idle cores to speed up swact activity. # -# At the end of the operation, regarless of the result, the service must be +# At the end of the operation, regardless of the result, the service must be # calling function affine_tasks_to_platform_cores to re-affine platform tasks # back to their assigned core(s). # @@ -101,61 +130,92 @@ function reaffineable_pids { ################################################################################ function affine_tasks_to_idle_cores { local cpulist - local cpuocc_list local vswitch_pid local pidlist local idle_cpulist local platform_cpus + local count=0 local rc=0 - local cpu=0 + + # Keep the last invocation of affining, truncate when we use idle cores + :> ${LOG_FILE} + + # Ensure this only runs on AIO + if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; } + then + LOG "Not AIO, nothing to do." + return $rc + fi if [ -f ${TASK_AFFINING_INCOMPLETE} ]; then read cpulist < ${TASK_AFFINING_INCOMPLETE} - log_debug "${TAG} Tasks have already been affined to CPU ($cpulist)." - return 0 + LOG "Tasks have already been affined to CPU ($cpulist)." + return $rc fi - if [[ "${KERNEL}" == *" RT "* ]]; then - return 0 - fi + # Get idle cpu occupancy of all logical cores in the last 5 seconds. + declare -a cpuocc_list=( $(sar -P ALL 1 5 | grep Average | awk '{if(NR>2)print $8}') ) - # Compile a list of cpus with idle percentage greater than 95% in the last - # 5 seconds. - cpuocc_list=($(sar -P ALL 1 5|grep Average|awk '{if(NR>2)print $8}')) - - for idle_value in ${cpuocc_list[@]}; do + # Determine logical cpus that are considered platform, or application + # cores with idle percentage greater than 95%. + declare -a idle_cpus=() + for cpu in ${!cpuocc_list[@]}; do + idleocc=${cpuocc_list[$cpu]} is_vswitch_core $cpu if [ $? -eq 1 ]; then - cpu=$(($cpu+1)) + continue + fi + + is_isolcpus_core $cpu + if [ $? -eq 1 ]; then continue fi is_platform_core $cpu if [ $? -eq 1 ]; then - # Platform core is added to the idle list by default - idle_cpulist=$idle_cpulist$cpu"," + idle_cpus+=( ${cpu} ) else - # Non platform core is added to the idle list if it is more - # than 95% idle - if [[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]]; then - idle_cpulist=$idle_cpulist$cpu"," + if [[ $(echo "${idleocc} > ${IDLEOCC_THRESHOLD}" | bc) -eq 1 ]]; then + idle_cpus+=( ${cpu} ) fi fi - cpu=$(($cpu+1)) done - idle_cpulist=$(echo $idle_cpulist|sed 's/.$//') + # comma separated list of idle cpus + idle_cpulist=$(printf '%s,' "${idle_cpus[@]}") + idle_cpulist=${idle_cpulist%,} - log_debug "${TAG} Affining all tasks to idle CPU ($idle_cpulist)" + LOG "Affining all tasks to idle CPU ($idle_cpulist)" pidlist=( $(reaffineable_pids) ) for pid in ${pidlist[@]}; do + count=$((${count} + 1)) taskset --all-tasks --pid --cpu-list \ ${idle_cpulist} ${pid} > /dev/null 2>&1 done # Save the cpu list to the temp file which will be read and removed when # tasks are reaffined to the platform cores later on. + # This list is consumed by SM so it knows about extra cores. echo $idle_cpulist > ${TASK_AFFINING_INCOMPLETE} + LOG "Affined ${count} processes to idle cores." + + # Wait for affining flag file to disappear. If the timeout period is reached, + # affine tasks back to platform cores. + watch_start_seconds=${SECONDS} + while [ -f ${TASK_AFFINING_INCOMPLETE} ]; do + elapsed_seconds=$(( ${SECONDS} - ${watch_start_seconds} )) + LOG "Waiting for swact to complete: ${elapsed_seconds} seconds." + if [ ${elapsed_seconds} -ge ${WATCH_TIMEOUT_SECONDS} ]; then + LOG "Exceeded watch timeout: ${WATCH_TIMEOUT_SECONDS} seconds," \ + "affining tasks to platform cores." + affine_tasks_to_platform_cores + LOG "Idle cores watch completed," \ + "tasks reaffined to platform cores." + break + fi + sleep 5 + done + return $rc } @@ -164,24 +224,36 @@ function affine_tasks_to_idle_cores { # to re-affine management tasks back to the platform cores. ################################################################################ function affine_tasks_to_platform_cores { - local cpulist local pidlist local rc=0 local count=0 - if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then - dbg_str="${TAG} Either tasks have never been affined to all/idle" - dbg_str="${TAG} cores or they have already been reaffined to" - dbg_str="${TAG} platform cores." - log_debug "$dbg_str" - return 0 + # Ensure this only runs on AIO + if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; } + then + LOG "Not AIO, nothing to do." + return $rc fi - read cpulist < ${TASK_AFFINING_INCOMPLETE} + # Abort if affine-tasks.sh is running + if [ -e ${PIDFILE} ]; then + pid=$(cat ${PIDFILE}) + if [ -n "${pid}" -a -e /proc/${pid} ]; then + LOG "Aborting, ${pid} already running: ${PIDFILE}." + return $rc + fi + fi - log_debug "${TAG} Reaffining tasks to platform cores (${PLATFORM_CPUS})..." + if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then + LOG "Either tasks have never been affined to all/idle cores" \ + "or they have already been reaffined to platform cores." + return $rc + fi + + LOG "Reaffining tasks to platform cores (${PLATFORM_CPUS})..." pidlist=( $(reaffineable_pids) ) for pid in ${pidlist[@]}; do + count=$((${count} + 1)) taskset --all-tasks --pid --cpu-list \ ${PLATFORM_CPUS} ${pid} > /dev/null 2>&1 done @@ -195,39 +267,42 @@ function affine_tasks_to_platform_cores { taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1 done - rm -rf ${TASK_AFFINING_INCOMPLETE} + rm -v -f ${TASK_AFFINING_INCOMPLETE} + LOG "Affined ${count} processes to platform cores." + return $rc } ################################################################################ -# The following function can be leveraged by cron tasks +# The following function returns a single logical cpu with greatest idle +# occupancy. This can be leveraged by cron tasks or other processes. +# (e.g., python-keystone) ################################################################################ function get_most_idle_core { - local cpuocc_list - local cpu=0 - local most_idle_value=${IDLE_MARK} + local most_idle_value=${IDLEOCC_THRESHOLD} local most_idle_cpu=0 - if [[ "${KERNEL}" == *" RT "* ]]; then - echo $cpu - return - fi + declare -a cpuocc_list=( $(sar -P ALL 1 5 | grep Average | awk '{if(NR>2)print $8}') ) - cpuocc_list=($(sar -P ALL 1 5|grep Average|awk '{if(NR>2)print $8}')) - - for idle_value in ${cpuocc_list[@]}; do + for cpu in ${!cpuocc_list[@]}; do + idle_value=${cpuocc_list[$cpu]} is_vswitch_core $cpu if [ $? -eq 1 ]; then - cpu=$(($cpu+1)) continue fi - if [ $(echo "$idle_value > $most_idle_value"|bc) -eq 1 ]; then - most_idle_value=$idle_value - most_idle_cpu=$cpu + is_isolcpus_core $cpu + if [ $? -eq 1 ]; then + continue + fi + + if [ $(echo "${idle_value} > ${most_idle_value}" | bc) -eq 1 ]; then + most_idle_value=${idle_value} + most_idle_cpu=${cpu} fi - cpu=$(($cpu+1)) done - echo $most_idle_cpu + LOG "get_most_idle_core: cpu=$most_idle_cpu, idleocc=$most_idle_value" + echo ${most_idle_cpu} } +