From fc33582e4fdaa4d2725f2d7c23395a3e4c19daba Mon Sep 17 00:00:00 2001 From: Steven Webster Date: Wed, 23 Oct 2019 09:33:39 -0500 Subject: [PATCH] Traffic control fixes and refresh This commit fixes a couple of issues surrounding the main StarlingX traffic control script. The following has been changed: - The name of the script has been changed to remove the 'cgcs_' prefix - Obsolete code involving the (old) infrastructure network and related consolidation with the management interface has been removed. - There is no more class/qdisc for live-migration, since that traffic now runs over the cluster-host network. It should be prioritized via a kubernetes network policy instead. (not in this commit) - The code has been cleaned up to define the handles and qdisc parameters globally so they can be changed once instead of in each tc command. - SM heartbeats are treated as high priority traffic. Previously, we were only treating a tos of 0x10 (low delay) as high priority traffic, while SM uses a DSCP value of 0xc0. 
- The script now handles IPv6 Closes-Bug: #1839386 Change-Id: I32cd58842865d8c8efe9444b23b05a96f1df168a Signed-off-by: Steven Webster --- tools/collector/scripts/collect_tc.sh | 10 +- .../platform-util/centos/platform-util.spec | 4 +- .../platform-util/scripts/cgcs_tc_setup.sh | 518 ------------------ .../scripts/remotelogging_tc_setup.sh | 4 +- utilities/platform-util/scripts/tc_setup.sh | 503 +++++++++++++++++ 5 files changed, 512 insertions(+), 527 deletions(-) delete mode 100755 utilities/platform-util/scripts/cgcs_tc_setup.sh create mode 100755 utilities/platform-util/scripts/tc_setup.sh diff --git a/tools/collector/scripts/collect_tc.sh b/tools/collector/scripts/collect_tc.sh index 737461a8..95cd0fef 100755 --- a/tools/collector/scripts/collect_tc.sh +++ b/tools/collector/scripts/collect_tc.sh @@ -40,13 +40,13 @@ for i in $(ip link | grep mtu | grep eth |awk '{print $2}' | sed 's#:##g'); do done ############################################################################### -# TC Configuration Script (/usr/local/bin/cgcs_tc_setup.sh) +# TC Configuration Script (/usr/local/bin/tc_setup.sh) ############################################################################### -delimiter ${LOGFILE} "cat /usr/local/bin/cgcs_tc_setup.sh" -if [ -f /usr/local/bin/cgcs_tc_setup.sh ]; then - cat /usr/local/bin/cgcs_tc_setup.sh >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} +delimiter ${LOGFILE} "cat /usr/local/bin/tc_setup.sh" +if [ -f /usr/local/bin/tc_setup.sh ]; then + cat /usr/local/bin/tc_setup.sh >> ${LOGFILE} 2>>${COLLECT_ERROR_LOG} else - echo "/usr/local/bin/cgcs_tc_setup.sh NOT FOUND" >> ${LOGFILE} + echo "/usr/local/bin/tc_setup.sh NOT FOUND" >> ${LOGFILE} fi ############################################################################### diff --git a/utilities/platform-util/centos/platform-util.spec b/utilities/platform-util/centos/platform-util.spec index b0c1fb55..a781a21b 100644 --- a/utilities/platform-util/centos/platform-util.spec +++ 
b/utilities/platform-util/centos/platform-util.spec @@ -57,7 +57,7 @@ mkdir -p $RPM_BUILD_ROOT/wheels install -m 644 dist/*.whl $RPM_BUILD_ROOT/wheels/ install -d %{buildroot}%{local_bindir} -install %{_buildsubdir}/scripts/cgcs_tc_setup.sh %{buildroot}%{local_bindir} +install %{_buildsubdir}/scripts/tc_setup.sh %{buildroot}%{local_bindir} install %{_buildsubdir}/scripts/remotelogging_tc_setup.sh %{buildroot}%{local_bindir} install %{_buildsubdir}/scripts/connectivity_test %{buildroot}%{local_bindir} install -m 555 %{_buildsubdir}/scripts/update-iso.sh %{buildroot}%{local_bindir} @@ -88,7 +88,7 @@ systemctl enable opt-platform.service %license LICENSE %defattr(-,root,root,-) /usr/bin/verify-license -%{local_bindir}/cgcs_tc_setup.sh +%{local_bindir}/tc_setup.sh %{local_bindir}/remotelogging_tc_setup.sh %{local_bindir}/connectivity_test %{local_sbindir}/patch-restart-mtce diff --git a/utilities/platform-util/scripts/cgcs_tc_setup.sh b/utilities/platform-util/scripts/cgcs_tc_setup.sh deleted file mode 100755 index 58130053..00000000 --- a/utilities/platform-util/scripts/cgcs_tc_setup.sh +++ /dev/null @@ -1,518 +0,0 @@ -#!/bin/sh - -# -# Copyright (c) 2017-2018 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# - -# $1 - interface -# $2 - interface type [mgmt, infra] -# $3 - link capacity -# $4 - dummy used to determine if we're backgrounded or not - -DEV=$1 -NETWORKTYPE=$2 -NETWORKSPEED=$3 - -if [ ${NETWORKTYPE} != "mgmt" -a ${NETWORKTYPE} != "infra" ]; then - exit 0 -fi - -# We want to be able to wait some time (typically <10 sec) for the -# network link to autonegotiate link speed. Re-run the script in -# the background so the parent can return right away and init can -# continue. 
-if [ $# -eq 3 ]; then - $0 $DEV $NETWORKTYPE $NETWORKSPEED dummy & - disown - exit 0 -fi - -function test_valid_speed { - # After the link is enabled but before the autonegotiation is complete - # the link speed may be read as either -1 or as 4294967295 (which is - # uint(-1) in twos-complement) depending on the kernel. Neither one is valid. - if (( $1 > 0 )) && (( $1 != 4294967295 )) - then - return 0 - else - return 1 - fi -} - -function log { - # It seems that syslog isn't yet running, so append directly to the syslog file - FILE=/var/log/platform.log - echo `date +%FT%T.%3N` `hostname` CGCS_TC_SETUP: $@ >> $FILE -} - -function infra_exists { - if [ -z "$infrastructure_interface" ]; then - return 1 - else - return 0 - fi -} - -function is_consolidated { - if ! infra_exists - then - return 1 - fi - - local INFRA=$infrastructure_interface - local MGMT=$management_interface - - # determine whether the management interface is a parent of the - # infrastructure interface based on name. - # eg. this matches enp0s8 to enp0s8.10 but not enp0s88 - if [[ $INFRA =~ $MGMT[\.][0-9]+$ ]]; then - return 0 - fi - return 1 -} - -function is_vlan { - if [ -f /proc/net/vlan/$DEV ]; then - return 0 - else - return 1 - fi -} - -function is_loopback { - # (from include/uapi/linux/if.h) - # IFF_LOOPBACK = 1<<3 = 8. Using a left shifted syntax can confuse bashate. - IFF_LOOPBACK=8 - - # get the interface flags - FLAGS=`cat /sys/class/net/$DEV/flags` - - if ((($IFF_LOOPBACK & $FLAGS) == 0)) - then - return 1 - else - return 0 - fi -} - -function get_tc_filter_ethertype { - local ETHERTYPE=$DEFAULT_ETHERTYPE - - if is_consolidated - then - if ! is_vlan - then - # If we have a consolidated VLAN interface, we must set the - # protocol to '802.1q' for the underlying Ethernet interface - # to be able to match on IP packets coming from the VLAN - # interface. 
- ETHERTYPE=802.1q - fi - fi - echo $ETHERTYPE - return 0 -} - -function setup_tc_port_filter { - local PORT=$1 - local PORTMASK=$2 - local FLOWID=$3 - local PROTOCOL=$4 - local PRIORITY=$DEFAULT_PRIORITY - local ETHERTYPE=$DEFAULT_ETHERTYPE - - ETHERTYPE=$(get_tc_filter_ethertype) - - if [ -z $PROTOCOL ]; then - # Apply to TCP and UDP - tc filter add dev $DEV protocol $ETHERTYPE parent 1:0 prio $PRIORITY \ - u32 match ip dport $PORT $PORTMASK flowid $FLOWID - tc filter add dev $DEV protocol $ETHERTYPE parent 1:0 prio $PRIORITY \ - u32 match ip sport $PORT $PORTMASK flowid $FLOWID - else - # Apply to specific protocol only - tc filter add dev $DEV protocol $ETHERTYPE parent 1:0 prio $PRIORITY \ - u32 match ip protocol $PROTOCOL 0xff match \ - ip dport $PORT $PORTMASK flowid $FLOWID - tc filter add dev $DEV protocol $ETHERTYPE parent 1:0 prio $PRIORITY \ - u32 match ip protocol $PROTOCOL 0xff match \ - ip sport $PORT $PORTMASK flowid $FLOWID - fi -} - -function setup_tc_tos_filter { - local TOS=$1 - local TOSMASK=$2 - local FLOWID=$3 - local ETHERTYPE=$4 - local PRIORITY=$5 - - if [ -z $ETHERTYPE ]; then - ETHERTYPE=$DEFAULT_ETHERTYPE - fi - - if [ -z $PRIORITY ]; then - PRIORITY=$DEFAULT_PRIORITY - fi - - tc filter add dev $DEV protocol $ETHERTYPE parent 1:0 prio $PRIORITY \ - u32 match ip tos $TOS $TOSMASK flowid $FLOWID -} - -function setup_root_tc { - # create new qdiscs, classes and queues - tc qdisc add dev $DEV root handle 1: htb default 40 - tc class add dev $DEV parent 1: classid 1:1 htb rate ${SPEED}mbit \ - burst 15k quantum 60000 -} - -function setup_default_tc { - local RATE=$1 - local CEIL=$2 - - local FLOWQ=40 - local CLASSID=1:$FLOWQ - local FLOWID=$CLASSID - - # create default qdiscs, classes - $AC $CLASSID htb rate $((${RATE}*${SPEED}/100))mbit burst 15k \ - ceil $((${CEIL}*${SPEED}/100))mbit prio 4 quantum 60000 - tc qdisc add dev $DEV parent $CLASSID handle $FLOWQ: sfq perturb 10 -} - -function setup_hiprio_tc { - local RATE=$1 - local CEIL=$2 - 
- local FLOWQ=10 - local CLASSID=1:$FLOWQ - local FLOWID=$CLASSID - local ETHERTYPE=$DEFAULT_ETHERTYPE - ETHERTYPE=$(get_tc_filter_ethertype) - - # create high priority qdiscs, classes, and queues - $AC $CLASSID htb rate $((${RATE}*${SPEED}/100))mbit burst 15k \ - ceil $((${CEIL}*${SPEED}/100))mbit prio 3 quantum 60000 - tc qdisc add dev $DEV parent $CLASSID handle $FLOWQ: sfq perturb 10 - - # filter for high priority traffic - setup_tc_tos_filter 0x10 0xf8 $FLOWID $ETHERTYPE - - if [ "$ETHERTYPE" != "$DEFAULT_ETHERTYPE" ]; then - # For the 'hiprio' class, a second filter at a different priority is - # needed in this case to match traffic with the default ethertype. - # (ie. high priority management traffic). - local PRIORITY - PRIORITY=$(($DEFAULT_PRIORITY + 1)) - setup_tc_tos_filter 0x10 0xf8 $FLOWID $DEFAULT_ETHERTYPE $PRIORITY - fi -} - -function setup_migration_tc { - local RATE=$1 - local CEIL=$2 - - local FLOWQ=30 - local CLASSID=1:$FLOWQ - local FLOWID=$CLASSID - - # create migration qdiscs, classes, and queues - $AC $CLASSID htb rate $((${RATE}*${SPEED}/100))mbit burst 15k \ - ceil $((${CEIL}*${SPEED}/100))mbit prio 2 quantum 60000 - tc qdisc add dev $DEV parent $CLASSID handle $FLOWQ: sfq perturb 10 - - # Migration (TCP, ports 49152-49215) - setup_tc_port_filter 49152 0xffc0 $FLOWID $TCP - - # Migration via libvirt tunnel (TCP, port 16509) - setup_tc_port_filter 16509 0xffff $FLOWID $TCP -} - -function setup_storage_tc { - local RATE=$1 - local CEIL=$2 - - local FLOWQ=20 - local CLASSID=1:$FLOWQ - local FLOWID=$CLASSID - - # create storage qdiscs, classes, and queues - $AC $CLASSID htb rate $((${RATE}*${SPEED}/100))mbit burst 15k \ - ceil $((${CEIL}*${SPEED}/100))mbit prio 1 quantum 60000 - tc qdisc add dev $DEV parent $CLASSID handle $FLOWQ: sfq perturb 10 - - # Storage, NFS (UDP/TCP, port 2049) - setup_tc_port_filter 2049 0xffff $FLOWID - - # Storage, iSCSI (UDP/TCP, port 3260) - setup_tc_port_filter 3260 0xffff $FLOWID - - # Storage, CEPH (TCP, ports 
6789,6800-7100) - PORTS=( 6789 6800 6816 6912 7040 7072 7088 ) - PORTMASKS=( 0xffff 0xfff0 0xffa0 0xff80 0xffa0 0xfff0 0xfffa ) - for idx in "${!PORTS[@]}"; do - PORT=${PORTS[$idx]} - MASK=${PORTMASKS[$idx]} - setup_tc_port_filter $PORT $MASK $FLOWID $TCP - done -} - -function setup_drbd_tc { - local RATE=$1 - local CEIL=$2 - - local FLOWQ=50 - local CLASSID=1:$FLOWQ - local FLOWID=$CLASSID - - # create DRBD qdiscs, classes and queues - $AC $CLASSID htb rate $((${RATE}*${SPEED}/100))mbit burst 15k \ - ceil $((${CEIL}*${SPEED}/100))mbit quantum 60000 - - tc qdisc add dev $DEV parent $CLASSID handle $FLOWQ: sfq perturb 10 - - # DRDB (TCP, ports 7789,7790,7791,7799) - # port 7793 is used with drdb-extension - PORTS=( 7789 7790 7791 7792 7799 7793 ) - PORTMASKS=( 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff) - for idx in "${!PORTS[@]}"; do - PORT=${PORTS[$idx]} - MASK=${PORTMASKS[$idx]} - setup_tc_port_filter $PORT $MASK $FLOWID $TCP - done -} - -function setup_mgmt_tc_individual { - # Configure high priority and default traffic classes. - - setup_root_tc - - # bandwidth percentages - local HIPRIO_BW=10 - local DEFAULT_BW=10 - - # bandwidth ceiling percentages, for borrowing bandwidth. - # the management interface is not consolidated, so set the ceiling to the - # maximum rate. - local HIPRIO_CBW=100 - local DEFAULT_CBW=100 - - setup_hiprio_tc $HIPRIO_BW $HIPRIO_CBW - setup_default_tc $DEFAULT_BW $DEFAULT_CBW -} - - -function setup_mgmt_tc_vlan { - # Configure high priority and default traffic classes. - - setup_root_tc - - # bandwidth percentages - local HIPRIO_BW=10 - local DEFAULT_BW=10 - - # bandwidth ceiling percentages, for borrowing bandwidth. - # The management interface is a vlan, so reserve bandwidth - # for sibling infra vlan interfaces. - local HIPRIO_CBW=20 - local DEFAULT_CBW=20 - - setup_hiprio_tc $HIPRIO_BW $HIPRIO_CBW - setup_default_tc $DEFAULT_BW $DEFAULT_CBW -} - -function setup_mgmt_tc_consolidated { - # Configure management classes. 
- # All traffic coming from the infra will get treated again by the - # management traffic classes. We need to apply the same TCs as the - # infra to prevent a management application from starving the - # upper interface. - setup_root_tc - setup_tc_all -} - -function setup_mgmt_tc_infra_exists { - if is_consolidated - then - # Infra over mgmt. In this case we want to reserve - # a small portion of the link for management. - setup_mgmt_tc_consolidated - else - # Only setup hiprio and default classes. - # The infra will handle storage, migration, DRBD. - if is_vlan - then - setup_mgmt_tc_vlan - else - setup_mgmt_tc_individual - fi - fi -} - -function setup_mgmt_tc_no_infra { - # Configure traffic classes for a management interface when - # no infrastructure interface exists. Configure the full - # set of TCs. - - setup_root_tc - setup_tc_all -} - -function setup_infra_tc_consolidated { - # Configure the full set of traffic classes, but leave a small - # portion of bandwidth for the management interface. - - # reserve 1% BW for management - local RESERVED - RESERVED=$((1*${SPEED}/100)) - SPEED=$((${SPEED}-${RESERVED})) - - setup_root_tc - setup_tc_all -} - -function setup_infra_tc_individual { - # Configure the full set of traffic classes. 
- - setup_root_tc - if is_vlan - then - # reserve 1% BW for sibling vlan interfaces - local RESERVED - RESERVED=$((1*${SPEED}/100)) - SPEED=$((${SPEED}-${RESERVED})) - fi - setup_tc_all -} - -function setup_tc_all { - # bandwidth percentages, in case of over-percentage, bandwidth is divided based - # on bandwidth ratios - local MIG_BW=30 - local STOR_BW=50 - local DRBD_BW=80 - local HIPRIO_BW=10 - local DEFAULT_BW=10 - - # bandwidth ceiling percentages, for borrowing bandwidth - local MIG_CBW=100 - local STOR_CBW=100 - local DRBD_CBW=100 - local HIPRIO_CBW=20 - local DEFAULT_CBW=20 - - setup_hiprio_tc $HIPRIO_BW $HIPRIO_CBW - setup_storage_tc $STOR_BW $STOR_CBW - setup_migration_tc $MIG_BW $MIG_CBW - setup_default_tc $DEFAULT_BW $DEFAULT_CBW - if [ $nodetype == "controller" ]; then - setup_drbd_tc $DRBD_BW $DRBD_CBW - fi -} - -function get_dev_speed { - # If the link doesn't come up we won't go enabled, so here we can - # afford to wait forever for the link. - while true; do - if [ -e /sys/class/net/$1/bonding ]; then - for VAL in `cat /sys/class/net/$1/lower_*/speed`; do - if test_valid_speed $VAL; then - log slave for bond link $1 reported speed $VAL - echo $VAL - return 0 - else - log slave for bond link $1 reported invalid speed $VAL - fi - done - log all slaves for bond link $1 reported invalid speeds, \ - will sleep 30 sec and try again - else - VAL=`cat /sys/class/net/$1/speed` - if test_valid_speed $VAL; then - log link $1 reported speed $VAL - echo $VAL - return 0 - else - log link $1 returned invalid speed $VAL, \ - will sleep 30 sec and try again - fi - fi - sleep 30 - done -} - -function get_speed { - local dev=$1 - local networktype=$2 - local net_speed=$NETWORKSPEED - local dev_speed - dev_speed=$(get_dev_speed $DEV) - local speed=$dev_speed - if [ $net_speed != $dev_speed ]; then - log WARNING: $dev has a different operational speed [$dev_speed] \ - than configured speed [$net_speed] for network type $networktype - if test_valid_speed $net_speed; 
then - # Use greater of configured net speed / recorded dev speed - if [ $net_speed -gt $dev_speed ]; then - speed=$net_speed - fi - fi - fi - log using speed $speed for tc filtering on $dev - echo $speed -} - - -if is_loopback -then - # mgmt/infra uses the loopback for CPE simplex - exit 0 -fi - -log running tc setup script for $DEV $NETWORKTYPE in background - -if [ -f /etc/platform/platform.conf ]; then - source /etc/platform/platform.conf -fi - -SPEED=$(get_speed $DEV $NETWORKTYPE) - -# 1:10 = high priority class -# 1:20 = storage class -# 1:30 = migration class -# 1:40 = default class -# 1:50 = DRBD class - -# generic class add preamble -AC="tc class add dev $DEV parent 1:1 classid" - -# protocol numbers -TCP=6 -UDP=17 - -# default ethertype for filters -DEFAULT_ETHERTYPE=ip - -# default priority for filters -DEFAULT_PRIORITY=1 - -# delete existing qdiscs -tc qdisc del dev $DEV root > /dev/null 2>&1 - -if [ ${NETWORKTYPE} = "mgmt" ]; then - if infra_exists - then - setup_mgmt_tc_infra_exists - else - setup_mgmt_tc_no_infra - fi -else - if is_consolidated - then - setup_infra_tc_consolidated - else - setup_infra_tc_individual - fi -fi diff --git a/utilities/platform-util/scripts/remotelogging_tc_setup.sh b/utilities/platform-util/scripts/remotelogging_tc_setup.sh index 7b7cf903..855eac32 100755 --- a/utilities/platform-util/scripts/remotelogging_tc_setup.sh +++ b/utilities/platform-util/scripts/remotelogging_tc_setup.sh @@ -30,7 +30,7 @@ function is_loopback { function log { # It seems that syslog isn't yet running, so append directly to the syslog file local FILE=/var/log/platform.log - echo `date +%FT%T.%3N` `hostname` CGCS_TC_SETUP: $@ >> $FILE + echo `date +%FT%T.%3N` `hostname` TC_SETUP: $@ >> $FILE } function test_valid_speed { @@ -91,7 +91,7 @@ LOG_BW=9 DEFAULT_CBW=20 LOG_CBW=20 -# 1:40 = default class from cgcs_tc_setup.sh +# 1:40 = default class from tc_setup.sh # 1:60 = LOG class if [ $nodetype == "controller" ]; then diff --git 
a/utilities/platform-util/scripts/tc_setup.sh b/utilities/platform-util/scripts/tc_setup.sh
new file mode 100755
index 00000000..1a2c4561
--- /dev/null
+++ b/utilities/platform-util/scripts/tc_setup.sh
@@ -0,0 +1,503 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2017-2019 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+# $1 - interface
+# $2 - interface type [mgmt]
+# $3 - link capacity
+# $4 - dummy used to determine if we're backgrounded or not
+
+DEV=$1
+NETWORKTYPE=$2
+NETWORKSPEED=$3
+
+# log file
+LOG_FILE=/var/log/platform.log
+
+# default ethertype for filters
+DEFAULT_FILTER_ETHERTYPE=ip
+
+# default match protocol for filters
+DEFAULT_FILTER_MATCH_PROTOCOL=ip
+
+# default priority for filters
+DEFAULT_FILTER_PRIORITY=1
+
+# default HTB class quantum (borrowing amount) in bytes
+DEFAULT_HTB_QUANTUM=60000
+
+# default HTB class burst (amount of bytes that can be burst at ceil speed)
+DEFAULT_HTB_BURST=15k
+
+# default SFQ algorithm perturbation in seconds (recommended 10)
+DEFAULT_SFQ_PERTUBATION=10
+
+# major handle for the root qdisc / class.
+# All objects in the same traffic control structure must share a major handle
+# number. Conventionally, numbering schemes start at 1 for objects attached
+# directly to the root qdisc.
+ROOT_HANDLE_MAJOR=1
+
+# minor handle for the root class.
+ROOT_HANDLE_MINOR=1
+
+# root qdisc id. The minor number namespace is left available for classes
+ROOT_QDISC_ID="${ROOT_HANDLE_MAJOR}:"
+
+# root class id. The root class id is typically 1:1
+ROOT_CLASS_ID="${ROOT_HANDLE_MAJOR}:${ROOT_HANDLE_MINOR}"
+
+# minor handle for a qdisc. Unambiguously identifies an object as a qdisc
+QDISC_HANDLE_MINOR=0
+
+# RFC 2474 class selector codepoints (DSCP)
+IPTOS_CLASS_CS0=0x00
+IPTOS_CLASS_CS1=0x20
+IPTOS_CLASS_CS2=0x40
+IPTOS_CLASS_CS3=0x60
+IPTOS_CLASS_CS4=0x80
+IPTOS_CLASS_CS5=0xa0
+IPTOS_CLASS_CS6=0xc0
+IPTOS_CLASS_CS7=0xe0
+
+# protocol numbers
+IPPROTO_TCP=6
+IPPROTO_UDP=17
+
+# relative guaranteed bandwidth percentages for traffic classes
+# in case of over-percentage, bandwidth is divided based on bandwidth ratios
+BW_PCT_DEFAULT=90
+BW_PCT_HIPRIO=10
+
+# ceiling percentages for traffic classes
+CEIL_PCT_DEFAULT=100
+CEIL_PCT_HIPRIO=20
+
+# class priority for traffic classes (lower = higher priority)
+CLASS_PRIORITY_DEFAULT=4
+CLASS_PRIORITY_HIPRIO=${DEFAULT_FILTER_PRIORITY}
+
+# arbitrary flow id for traffic classes
+FLOWID_DEFAULT=40
+FLOWID_HIPRIO=10
+
+# Network types to apply traffic controls on
+VALID_NETWORKTYPES=("mgmt")
+
+## Append a timestamped message to the log file
+function log {
+    echo "$(date +%FT%T.%3N) $(hostname) TC_SETUP: $*" >> "${LOG_FILE}"
+}
+
+## sanity check the given speed
+function test_valid_speed {
+    # After the link is enabled but before the autonegotiation is complete
+    # the link speed may be read as either -1 or as 4294967295 (which is
+    # uint(-1) in twos-complement) depending on the kernel. Neither one is
+    # valid.
+    if (( $1 > 0 )) && (( $1 != 4294967295 )); then
+        return 0
+    else
+        return 1
+    fi
+}
+
+## determine the negotiated speed for a given interface
+function get_dev_speed {
+    local VAL
+
+    # If the link doesn't come up we won't go enabled, so here we can
+    # afford to wait forever for the link.
+    while true; do
+        if [ -e "/sys/class/net/$1/bonding" ]; then
+            for VAL in $(cat /sys/class/net/"$1"/lower_*/speed); do
+                if test_valid_speed "${VAL}"; then
+                    log "slave for bond link $1 reported speed ${VAL}"
+                    echo "${VAL}"
+                    return 0
+                else
+                    log "slave for bond link $1 reported invalid speed ${VAL}"
+                fi
+            done
+            log "all slaves for bond link $1 reported invalid speeds," \
+                "will sleep 30 sec and try again"
+        else
+            VAL=$(cat "/sys/class/net/$1/speed")
+            if test_valid_speed "${VAL}"; then
+                log "link $1 reported speed ${VAL}"
+                echo "${VAL}"
+                return 0
+            else
+                log "link $1 returned invalid speed ${VAL}," \
+                    "will sleep 30 sec and try again"
+            fi
+        fi
+        sleep 30
+    done
+}
+
+## Determines the maximum speed (Mbps) that should be used in traffic control
+## rate / ceiling calculations
+function get_speed {
+    local dev=$1
+    local networktype=$2
+    local net_speed=${NETWORKSPEED}
+    local dev_speed=$(get_dev_speed "${dev}")
+    local speed=${dev_speed}
+
+    if [ "${net_speed}" != "${dev_speed}" ]; then
+        log "WARNING: ${dev} has a different operational speed" \
+            "[${dev_speed}] than configured speed [${net_speed}]" \
+            "for network type ${networktype}"
+        if test_valid_speed "${net_speed}"; then
+            # Use greater of configured net speed / recorded dev speed
+            if [ "${net_speed}" -gt "${dev_speed}" ]; then
+                speed=${net_speed}
+            fi
+        fi
+    fi
+    log "using speed ${speed} for tc filtering on ${dev}"
+    echo "${speed}"
+}
+
+## Determines whether a device is a loopback interface
+function is_loopback {
+    local DEVICE=$1
+
+    # (from include/uapi/linux/if.h)
+    # IFF_LOOPBACK = 1<<3 = 8. Using a left shifted syntax can confuse bashate.
+    local IFF_LOOPBACK=8
+
+    # get the interface flags
+    local FLAGS=$(cat "/sys/class/net/${DEVICE}/flags")
+
+    if (((${IFF_LOOPBACK} & ${FLAGS}) == 0)); then
+        return 1
+    else
+        return 0
+    fi
+}
+
+## Determines whether the network type requires traffic controls
+function is_valid_networktype {
+    local NETTYPE=$1
+    local nt
+
+    for nt in "${VALID_NETWORKTYPES[@]}"; do
+        if [ "${NETTYPE}" == "${nt}" ]; then
+            return 0
+        fi
+    done
+    return 1
+}
+
+## Determines whether the given device is a vlan interface
+function is_vlan {
+    local DEVICE=$1
+    [ -f "/proc/net/vlan/${DEVICE}" ]
+}
+
+## Delete existing classes, qdiscs, and filters
+function delete_tcs {
+    local DEVICE=$1
+
+    # Deleting the root qdisc will also delete all underlying
+    # classes, qdiscs and filters
+    tc qdisc del dev "${DEVICE}" root > /dev/null 2>&1
+}
+
+## Create the root qdisc and class
+function setup_root_tc {
+    local DEVICE=$1
+    local RATE=$2
+
+    local QDISC_TYPE="htb"
+    local CLASS_TYPE="htb"
+
+    tc qdisc add dev "${DEVICE}" root handle ${ROOT_QDISC_ID} ${QDISC_TYPE} \
+        default ${FLOWID_DEFAULT}
+
+    tc class add dev "${DEVICE}" parent ${ROOT_QDISC_ID} \
+        classid ${ROOT_CLASS_ID} \
+        ${CLASS_TYPE} \
+        rate ${RATE}mbit \
+        burst ${DEFAULT_HTB_BURST} \
+        quantum ${DEFAULT_HTB_QUANTUM}
+}
+
+## Create classes and qdiscs for default (unfiltered) traffic
+function setup_default_tc {
+    local DEVICE=$1
+    local MAXSPEED=$2
+    local MIN_BW_PCT=$3
+    local MAX_BW_PCT=$4
+    local RATE=$((${MIN_BW_PCT}*${MAXSPEED}/100))
+    local CEIL=$((${MAX_BW_PCT}*${MAXSPEED}/100))
+    local CLASS_TYPE="htb"
+    local QDISC_TYPE="sfq"
+    local QDISC_ID="${FLOWID_DEFAULT}:"
+
+    # associate the objects with the root qdisc/class
+    local CLASS_ID=${ROOT_HANDLE_MAJOR}:${FLOWID_DEFAULT}
+    local QDISC_PARENT=${ROOT_CLASS_ID}
+
+    tc class add dev "${DEVICE}" parent ${QDISC_PARENT} classid ${CLASS_ID} \
+        ${CLASS_TYPE} \
+        rate ${RATE}mbit \
+        burst ${DEFAULT_HTB_BURST} \
+        ceil ${CEIL}mbit \
+        prio ${CLASS_PRIORITY_DEFAULT} \
+        quantum ${DEFAULT_HTB_QUANTUM}
+
+    tc qdisc add dev "${DEVICE}" parent ${CLASS_ID} handle ${QDISC_ID} \
+        ${QDISC_TYPE} \
+        perturb ${DEFAULT_SFQ_PERTUBATION}
+}
+
+## Get the match parameters to filter on TOS/DSCP
+function get_tc_tos_match {
+    local IP_VERSION=$1
+    local TOS=$2
+
+    # 6 bits DSCP
+    local TOSMASK=0xfc
+
+    # IPv4 unless IPv6 is requested
+    local L3PROTO="ip"
+    local TOS_FIELD="tos"
+    if [ "${IP_VERSION}" == 6 ]; then
+        L3PROTO="ip6"
+        TOS_FIELD="priority"
+    fi
+
+    echo "match ${L3PROTO} ${TOS_FIELD} ${TOS} ${TOSMASK}"
+}
+
+## Get the match parameters to filter on a L4 protocol
+function get_tc_l4_protocol_match {
+    local IP_VERSION=$1
+    local L4PROTOCOL=$2
+
+    # 8 bits protocol
+    local PROTOCOLMASK=0xff
+
+    # IPv4 unless IPv6 is requested
+    local L3PROTO="ip"
+    if [ "${IP_VERSION}" == 6 ]; then
+        L3PROTO="ip6"
+    fi
+
+    echo "match ${L3PROTO} protocol ${L4PROTOCOL} ${PROTOCOLMASK}"
+}
+
+## Get the match parameters to filter on a port range
+function get_tc_port_match {
+    local IP_VERSION=$1
+    local PORT=$2
+    local PORTMASK=$3
+    local TYPE=${4:-"src"}
+
+    if [ "${TYPE}" == "src" ]; then
+        TYPE="sport"
+    else
+        TYPE="dport"
+    fi
+
+    # IPv4 unless IPv6 is requested
+    local L3PROTO="ip"
+    if [ "${IP_VERSION}" == 6 ]; then
+        L3PROTO="ip6"
+    fi
+
+    echo "match ${L3PROTO} ${TYPE} ${PORT} ${PORTMASK}"
+}
+
+## Get the l2/l3 protocol
+function get_tc_protocol {
+    local IP_VERSION=$1
+    local ETHERTYPE=$2
+    local PROTOCOL=${ETHERTYPE}
+
+    if [ -z "${PROTOCOL}" ]; then
+        # If the ethertype was not explicitly specified, infer it
+        # from the IP version
+        if [ "${IP_VERSION}" == 6 ]; then
+            PROTOCOL="ipv6"
+        else
+            PROTOCOL="ip"
+        fi
+    fi
+
+    echo "${PROTOCOL}"
+}
+
+## Create a filter to deliver system maintenance heartbeats to the high
+## priority class
+function setup_tc_sm_filter {
+    local DEVICE=$1
+    local FLOWID=$2
+    local ETHERTYPE=$3
+    local PRIORITY=${DEFAULT_FILTER_PRIORITY}
+
+    # Setup filters for both IPv4 and IPv6
+    local IP_VERSIONS=(4 6)
+
+    # SM uses UDP over ports 2222-2223 with DSCP CS6 (TOS byte 0xc0)
+    local SM_PORT=2222
+    local SM_PORTMASK=0xfffe
+    local SM_PORTTYPE="dst"
+    local SM_TOS=${IPTOS_CLASS_CS6}
+    local SM_PROTO=${IPPROTO_UDP}
+
+    # specifies attaching the filter to the root qdisc
+    local QDISC_ID=${ROOT_HANDLE_MAJOR}:${QDISC_HANDLE_MINOR}
+
+    local IP_VERSION
+    for IP_VERSION in "${IP_VERSIONS[@]}"; do
+        local PROTOCOL=$(get_tc_protocol ${IP_VERSION} "${ETHERTYPE}")
+        local TOS_MATCH=$(get_tc_tos_match ${IP_VERSION} ${SM_TOS})
+        local PROTO_MATCH=$(get_tc_l4_protocol_match ${IP_VERSION} ${SM_PROTO})
+        local PORT_MATCH=$(get_tc_port_match \
+            ${IP_VERSION} ${SM_PORT} ${SM_PORTMASK} ${SM_PORTTYPE})
+        local MATCH_PARAMS="${TOS_MATCH} ${PROTO_MATCH} ${PORT_MATCH}"
+
+        # MATCH_PARAMS is deliberately unquoted: it must split into words
+        tc filter add dev "${DEVICE}" protocol ${PROTOCOL} parent ${QDISC_ID} \
+            prio ${PRIORITY} u32 ${MATCH_PARAMS} flowid ${FLOWID}
+
+        PRIORITY=$((PRIORITY + 1))
+    done
+}
+
+## Create filters to deliver traffic on a port range (optionally limited to
+## a single L4 protocol) to the given flow, for both sport and dport
+function setup_tc_port_filter {
+    local DEVICE=$1
+    local FLOWID=$2
+    local ETHERTYPE=$3
+    local PORT=$4
+    local PORTMASK=$5
+    local L4PROTOCOL=$6
+    local PRIORITY=${DEFAULT_FILTER_PRIORITY}
+
+    # Setup filters for both IPv4 and IPv6
+    local IP_VERSIONS=(4 6)
+
+    # Setup filters for both sport and dport
+    local PORT_TYPES=("src" "dst")
+
+    # specifies attaching the filter to the root qdisc
+    local QDISC_ID=${ROOT_HANDLE_MAJOR}:${QDISC_HANDLE_MINOR}
+
+    local IP_VERSION PORT_TYPE
+    for IP_VERSION in "${IP_VERSIONS[@]}"; do
+        local PROTOCOL=$(get_tc_protocol ${IP_VERSION} "${ETHERTYPE}")
+
+        for PORT_TYPE in "${PORT_TYPES[@]}"; do
+
+            local PORT_MATCH=$(get_tc_port_match \
+                ${IP_VERSION} "${PORT}" "${PORTMASK}" ${PORT_TYPE})
+
+            if [ -z "${L4PROTOCOL}" ]; then
+                # Apply to TCP and UDP
+                tc filter add dev "${DEVICE}" protocol ${PROTOCOL} \
+                    parent ${QDISC_ID} prio ${PRIORITY} u32 ${PORT_MATCH} \
+                    flowid "${FLOWID}"
+            else
+                # Apply to specific protocol only
+                local PROTO_MATCH=$(get_tc_l4_protocol_match \
+                    ${IP_VERSION} "${L4PROTOCOL}")
+                tc filter add dev "${DEVICE}" protocol ${PROTOCOL} \
+                    parent ${QDISC_ID} prio ${PRIORITY} u32 ${PROTO_MATCH} \
+                    ${PORT_MATCH} flowid "${FLOWID}"
+            fi
+        done
+        PRIORITY=$((PRIORITY + 1))
+    done
+}
+
+## Create classes, qdiscs, and filters for high priority traffic
+function setup_hiprio_tc {
+    local DEVICE=$1
+    local MAXSPEED=$2
+    local MIN_BW_PCT=$3
+    local MAX_BW_PCT=$4
+    local ETHERTYPE=$5
+    local RATE=$((${MIN_BW_PCT}*${MAXSPEED}/100))
+    local CEIL=$((${MAX_BW_PCT}*${MAXSPEED}/100))
+    local CLASS_TYPE="htb"
+    local QDISC_ID="${FLOWID_HIPRIO}:"
+    local QDISC_TYPE="sfq"
+
+    # associate the objects with the root qdisc/class
+    local CLASS_ID=${ROOT_HANDLE_MAJOR}:${FLOWID_HIPRIO}
+    local QDISC_PARENT=${ROOT_CLASS_ID}
+    local FLOWID=${CLASS_ID}
+
+    tc class add dev "${DEVICE}" parent ${QDISC_PARENT} classid ${CLASS_ID} \
+        ${CLASS_TYPE} \
+        rate ${RATE}mbit \
+        burst ${DEFAULT_HTB_BURST} \
+        ceil ${CEIL}mbit \
+        prio ${CLASS_PRIORITY_HIPRIO} \
+        quantum ${DEFAULT_HTB_QUANTUM}
+
+    tc qdisc add dev "${DEVICE}" parent ${CLASS_ID} handle ${QDISC_ID} \
+        ${QDISC_TYPE} \
+        perturb ${DEFAULT_SFQ_PERTUBATION}
+
+    # Treat system maintenance heartbeats as high priority traffic
+    setup_tc_sm_filter "${DEVICE}" "${FLOWID}" "${ETHERTYPE}"
+}
+
+if ! is_valid_networktype "${NETWORKTYPE}"; then
+    exit 0
+fi
+
+if is_loopback "${DEV}"; then
+    # Don't setup traffic classes for a loopback device (ie simplex system)
+    exit 0
+fi
+
+# We want to be able to wait some time (typically <10 sec) for the
+# network link to autonegotiate link speed. Re-run the script in
+# the background so the parent can return right away and init can
+# continue.
+if [ $# -eq 3 ]; then
+    "$0" "${DEV}" "${NETWORKTYPE}" "${NETWORKSPEED}" dummy &
+    disown
+    exit 0
+fi
+
+log "running tc setup script for ${DEV} ${NETWORKTYPE} in background"
+
+if [ -f /etc/platform/platform.conf ]; then
+    source /etc/platform/platform.conf
+fi
+
+SPEED=$(get_speed "${DEV}" "${NETWORKTYPE}")
+
+delete_tcs "${DEV}"
+setup_root_tc "${DEV}" "${SPEED}"
+setup_default_tc "${DEV}" "${SPEED}" ${BW_PCT_DEFAULT} ${CEIL_PCT_DEFAULT}
+setup_hiprio_tc "${DEV}" "${SPEED}" ${BW_PCT_HIPRIO} ${CEIL_PCT_HIPRIO}
+
+if is_vlan "${DEV}"; then
+    # Handle each lower_* link separately: globbing inside '[ -e ]' or passing
+    # several paths to basename breaks as soon as more than one link matches.
+    for LOWER_LINK in /sys/class/net/"${DEV}"/lower_*; do
+        [ -e "${LOWER_LINK}" ] || continue
+        LOWER=$(basename "$(readlink "${LOWER_LINK}")")
+        # In the case of a vlan interface, reserve bandwidth for high
+        # priority traffic on the underlying interface.
+        delete_tcs "${LOWER}"
+        setup_root_tc "${LOWER}" "${SPEED}"
+        setup_default_tc "${LOWER}" "${SPEED}" $((100-${BW_PCT_HIPRIO})) 100
+        setup_hiprio_tc "${LOWER}" "${SPEED}" ${BW_PCT_HIPRIO} \
+            ${CEIL_PCT_HIPRIO} "802.1q"
+    done
+fi