integ/tools/engtools/hostdata-collectors/scripts/collect-engtools.sh
Tee Ngo 610a9282c9 Various engtools fixes
- Change engtools init order to ensure that stats streaming agents on
  the compute and storage nodes do not
  start prematurely after DOR
- Workaround a systemd preun scriptlet issue that caused patch removal
  failure
- Stream database stats in batches (max 10 DBs/batch)
- Account for new processes

Story: 2002895
Task: 22858

Change-Id: Iaeeca7f51b442c27fc475777abc612d53dc97ce5
Signed-off-by: Jack Ding <jack.ding@windriver.com>
Signed-off-by: Scott Little <scott.little@windriver.com>
2018-08-01 12:42:02 -04:00

341 lines
11 KiB
Bash

#!/bin/bash
# Usage:
# collect-engtools.sh [-f] [-p <period_mins>] [-i <interval_seconds>] [-c <cpulist>] [-h]
# Define common utility functions
# The helper library is loaded from the directory this script was started
# from.  Both expansions are quoted so that an install path containing
# whitespace does not break the dirname call or the source.
TOOLBIN=$(dirname "$0")
. "${TOOLBIN}/engtools_util.sh"

# ENABLE DEBUG (0=disable, 1=enable)
OPT_DEBUG=0

# Set options for long soak (vs. shorter collection)
# NOTE(review): the interval selection later in this script gives OPT_SOAK=1
# the longest sampling intervals and treats every value other than 1 or 2
# (including 0) as a short collection -- confirm the labels below.
#OPT_SOAK=0 # long soak
OPT_SOAK=1 # few hour soak
#OPT_SOAK=2 # < hour soak

# Define command to set nice + ionice
# (cmd_idle_priority is not defined in this file; presumably provided by
# engtools_util.sh)
CMD_IDLE=$( cmd_idle_priority )

# Purge configuration options
# - how much data may be created per cycle
PURGE_HEADROOM_MB=100
# - how much remaining space to leave
PURGE_HEADROOM_PERCENT=15
# - maximum size of data collection
PURGE_MAXUSAGE_MB=1000

# Affine to pinned cores
AFFINE_PINNED=1

# Line-buffer stream output (instead of buffered)
# Kept as one string; it is expanded unquoted as a command prefix.
STDBUF="stdbuf -oL"
# Define some common durations (all values are in seconds)
# Uses $(( )) arithmetic expansion; the older $[ ] form is deprecated.
DUR_60MIN_IN_SEC=$(( 60 * 60 ))
DUR_30MIN_IN_SEC=$(( 30 * 60 ))
DUR_15MIN_IN_SEC=$(( 15 * 60 ))
DUR_10MIN_IN_SEC=$(( 10 * 60 ))
DUR_5MIN_IN_SEC=$(( 5 * 60 ))
DUR_1MIN_IN_SEC=$(( 1 * 60 ))
# Global variables shared by the functions below and the main program
# - output files started by the most recent do_parallel_commands call
declare -a parallel_outfiles
# - file-system size / availability and collection directory usage (bytes)
declare df_size_bytes df_avail_bytes du_used_bytes
# - purge thresholds (bytes)
declare tgt_avail_bytes tgt_used_bytes
# do_parallel_commands - launch parallel tools with separate output files
# - reads:   tlist, TOOL_DEST_DIR, HOSTNAME, timestamp, STDBUF
# - updates: parallel_outfiles (reset, then one entry per named tool)
#
# Each tlist element is one string of whitespace-separated key=value
# attributes: tool, name, interval and either period or repeat.
# Every tool is started in the background; this function does not wait
# for them.
function do_parallel_commands()
{
  local elem fname

  parallel_outfiles=()
  for elem in "${tlist[@]}"; do
    # Start every entry from a clean slate.  'name' in particular must be
    # reset, otherwise an entry without name= would inherit the previous
    # entry's name and write over its output file.
    local tool="" name="" period="" repeat="" interval=""

    # Deliberately unquoted: word splitting turns "k1=v1 k2=v2 ..." into
    # separate assignments that 'local' applies to the variables above.
    # shellcheck disable=SC2086
    local ${elem}

    if [ -n "${name}" ]; then
      fname="${TOOL_DEST_DIR}/${HOSTNAME}_${timestamp}_${name}"
      parallel_outfiles+=( "${fname}" )
      LOG "collecting ${tool}, ${interval} second intervals, to: ${fname}"
      # STDBUF is expanded unquoted on purpose: it holds a command prefix
      # made of several words.
      if [ -n "${period}" ]; then
        ${STDBUF} "${tool}" -p "${period}" -i "${interval}" > "${fname}" 2>/dev/null &
      elif [ -n "${repeat}" ]; then
        ${STDBUF} "${tool}" --repeat="${repeat}" --delay="${interval}" > "${fname}" 2>/dev/null &
      fi
    else
      # run without file output (eg., ticker)
      ${STDBUF} "${tool}" -p "${period}" -i "${interval}" 2>/dev/null &
    fi
  done
}
# get_current_avail_usage() - get output destination file-system usage and
# availability.
# - reads:   TOOL_DEST_DIR
# - updates: df_size_bytes, df_avail_bytes, du_used_bytes (all in bytes)
function get_current_avail_usage()
{
  local -a df_arr_bytes

  # Second line of 'df -P' is the data row; columns 2 and 4 are the total
  # size and the available space of the file system holding TOOL_DEST_DIR.
  df_arr_bytes=( $(df -P --block-size=1 "${TOOL_DEST_DIR}" | awk 'NR==2 {print $2, $4}') )
  df_size_bytes=${df_arr_bytes[0]}
  df_avail_bytes=${df_arr_bytes[1]}

  # -s makes du print a single summarized total.  Without it du prints one
  # line per subdirectory before the total, so the first line would be a
  # subdirectory's usage whenever TOOL_DEST_DIR contains one.
  du_used_bytes=$(du -s --block-size=1 "${TOOL_DEST_DIR}" | awk 'NR==1 {print $1}')
}
# purge_oldest_files() - remove oldest files based on file-system available space,
# and maximum collection size
# - reads:   TOOL_DEST_DIR, HOSTNAME, OPT_DEBUG, tgt_avail_bytes, tgt_used_bytes
# - updates: df_size_bytes, df_avail_bytes, du_used_bytes
#            (through get_current_avail_usage, then adjusted per removed file)
# - returns: 0
function purge_oldest_files()
{
  local msg file sz_bytes

  # get current file-system usage
  get_current_avail_usage
  # Variables are referenced by name inside $(( )) so that an empty value
  # evaluates as 0 instead of producing a syntax error.
  msg=$(printf "avail %d MB, headroom %d MB; used %d MB, max %d MB" \
    $(( df_avail_bytes / 1024 / 1024 )) $(( tgt_avail_bytes / 1024 / 1024 )) \
    $(( du_used_bytes / 1024 / 1024 )) $(( tgt_used_bytes / 1024 / 1024 )))
  LOG "usage: ${msg}"

  # Nothing to do while both targets are met
  if [[ $df_avail_bytes -ge $tgt_avail_bytes ]] && \
     [[ $du_used_bytes -le $tgt_used_bytes ]]; then
    return 0
  fi

  # wait for compression to complete (bare 'wait' blocks on every background
  # child of this shell), then measure again
  wait
  get_current_avail_usage
  if [[ $df_avail_bytes -lt $tgt_avail_bytes ]]; then
    msg=$(printf "avail %d MB < target %d MB" \
      $(( df_avail_bytes / 1024 / 1024 )) $(( tgt_avail_bytes / 1024 / 1024 )))
    LOG "purge: ${msg}"
  fi
  if [[ $du_used_bytes -gt $tgt_used_bytes ]]; then
    msg=$(printf "used %d MB > target %d MB" \
      $(( du_used_bytes / 1024 / 1024 )) $(( tgt_used_bytes / 1024 / 1024 )))
    LOG "purge: ${msg}"
  fi

  # remove files in oldest time sorted order until we meet usage targets,
  # incrementally updating usage as we remove files.
  # The listing is read one line per file (on fd 3) rather than word-split,
  # so paths containing blanks are handled as a single file.
  while IFS= read -r -u 3 file; do
    if [[ $df_avail_bytes -ge $tgt_avail_bytes ]] && \
       [[ $du_used_bytes -le $tgt_used_bytes ]]; then
      break
    fi

    # only regular files are purge candidates
    [[ -f "${file}" ]] || continue

    if [[ ${OPT_DEBUG} -eq 1 ]]; then
      msg="purge: file=$file"
      if [[ $df_avail_bytes -lt $tgt_avail_bytes ]]; then
        msg="${msg}, < AVAIL"
      fi
      if [[ $du_used_bytes -gt $tgt_used_bytes ]]; then
        msg="${msg}, > MAXUSAGE"
      fi
      LOG "${msg}"
    fi

    # skip files that vanished between the listing and now
    sz_bytes=$(stat --printf="%s" -- "${file}") || continue

    # only account for space that was really released
    if rm -fv -- "${file}"; then
      (( df_avail_bytes += sz_bytes ))
      (( du_used_bytes -= sz_bytes ))
    fi
  done 3< <(ls -1rtd -- "${TOOL_DEST_DIR}/${HOSTNAME}"_* 2>/dev/null)

  return 0
}
#-------------------------------------------------------------------------------
# MAIN Program:
#-------------------------------------------------------------------------------
# Pull in optional overrides from /etc/default when that file is readable
NAME=collect-engtools.sh
if [ -r "/etc/default/$NAME" ]; then
  . "/etc/default/$NAME"
fi

# Initialize tool (tools_init is not defined in this file)
tools_init

# Parse input options, handing over the command line untouched
# (tools_parse_options is not defined in this file)
tools_parse_options "$@"
# Set affinity of current script
# CPULIST stays empty unless pinned-core affinity is requested and the nova
# extension file below exists.
CPULIST=""

# Affine tools to NOVA pinned cores (i.e., non-cpu 0)
# - remove interference with cpu 0
if [ "${AFFINE_PINNED}" -eq 1 ]; then
  NOVA_CONF=/etc/nova/compute_extend.conf
  if [ -f "${NOVA_CONF}" ]; then
    # the sourced file is expected to define compute_pinned_cpulist
    . "${NOVA_CONF}"
    CPULIST=${compute_pinned_cpulist}
  fi
fi

# Left unquoted as in the original call: an empty list passes no argument.
# (set_affinity is not defined in this file)
set_affinity ${CPULIST}
# Define output directory
# Hosts whose name contains "controller-" collect under /scratch; every
# other host (compute nodes included) collects under /tmp.
case "${HOSTNAME}" in
  *controller-*)
    TOOL_DEST_DIR="/scratch/syseng_data/${HOSTNAME}"
    ;;
  *)
    TOOL_DEST_DIR="/tmp/syseng_data/${HOSTNAME}"
    ;;
esac
mkdir -p "${TOOL_DEST_DIR}"

# Define daemon log output
# (TOOLNAME is not set in this file; presumably provided by tools_init or
# the sourced utility library)
timestamp=$( date +"%Y-%0m-%0e_%H%M" )
DAEMON_OUT="${TOOL_DEST_DIR}/${HOSTNAME}_${timestamp}_${TOOLNAME}.log"

# Redirect stdout and append to log if not connected to TTY
# Only stdout (fd 1) is redirected; stderr is left as it is.
if test ! -t 1 ; then
  exec 1>> "${DAEMON_OUT}"
fi
# Get current availability and usage
get_current_avail_usage

# Calculate disk usage and availability purge targets
# - tgt_used_bytes:  maximum size of the collection directory
# - tgt_avail_bytes: free space to keep on the file system, i.e.
#   PURGE_HEADROOM_PERCENT of its size plus PURGE_HEADROOM_MB
# Uses $(( )) arithmetic expansion; the older $[ ] form is deprecated.
df_offset_bytes=$(( PURGE_HEADROOM_MB * 1024 * 1024 ))
tgt_used_bytes=$(( PURGE_MAXUSAGE_MB * 1024 * 1024 ))
# the division is done first, as in the original expression
(( tgt_avail_bytes = df_size_bytes / 100 * PURGE_HEADROOM_PERCENT + df_offset_bytes ))
# Set granularity based on duration
# Collections of up to 30 minutes are cut into 5 minute increments, longer
# ones into 60 minute increments.
if [ "${PERIOD_MIN}" -le 30 ]; then
  GRAN_MIN=5
else
  GRAN_MIN=60
fi

# Adjust repeats and intervals based on GRAN_MIN granularity
# - round PERIOD_MIN up to the next multiple of GRAN_MIN
# - REPEATS is the number of GRAN_MIN sized increments in PERIOD_MIN
# Uses $(( )) arithmetic expansion; the older $[ ] form is deprecated.
PERIOD_MIN=$(( (PERIOD_MIN + (GRAN_MIN - 1)) / GRAN_MIN * GRAN_MIN ))
(( REPEATS = PERIOD_MIN / GRAN_MIN ))
GRAN_MIN_IN_SEC=$(( GRAN_MIN * 60 ))

# The sample interval is capped at the length of one increment
if [ "${INTERVAL_SEC}" -gt "${GRAN_MIN_IN_SEC}" ]; then
  INTERVAL_SEC=${GRAN_MIN_IN_SEC}
fi
# Define tools and options
# [ JGAULD - need config file for customization; long soak vs specific tools ]
# [ Ideally sample < 5 second granularity, but files get big, and tool has cpu overhead ]
# [ Need < 5 second granularity to see cache pressure/flush issues ]
# [ Desire 60 sec interval for soak ]
#
# Per-tool sample intervals, in seconds, selected by OPT_SOAK.
if [ "${OPT_SOAK}" -eq 1 ]; then
  # Desire 60 second or greater interval for longer term data collections,
  # otherwise collection files get too big.
  schedtop_interval=20
  occtop_interval=60
  memtop_interval=60
  netstats_interval=60
  # JGAULD: temporarily increase frequency to 1 min
  postgres_interval=${DUR_1MIN_IN_SEC}
  #postgres_interval=${DUR_15MIN_IN_SEC}
  rabbitmq_interval=${DUR_15MIN_IN_SEC}
  ceph_interval=${DUR_15MIN_IN_SEC}
  diskstats_interval=${DUR_15MIN_IN_SEC}
  memstats_interval=${DUR_15MIN_IN_SEC}
  filestats_interval=${DUR_15MIN_IN_SEC}
elif [ "${OPT_SOAK}" -eq 2 ]; then
  # Assume much shorter collection (eg, < hours)
  schedtop_interval=2 # i.e., 2 second interval
  occtop_interval=2 # i.e., 2 second interval
  memtop_interval=1 # i.e., 1 second interval
  netstats_interval=30 # i.e., 30 second interval
  postgres_interval=${DUR_5MIN_IN_SEC}
  rabbitmq_interval=${DUR_5MIN_IN_SEC}
  ceph_interval=${DUR_5MIN_IN_SEC}
  diskstats_interval=${DUR_5MIN_IN_SEC}
  memstats_interval=${DUR_5MIN_IN_SEC}
  filestats_interval=${DUR_5MIN_IN_SEC}
else
  # Assume shorter collection (eg, < a few hours)
  schedtop_interval=5 # i.e., 5 second interval
  occtop_interval=5 # i.e., 5 second interval
  memtop_interval=5 # i.e., 5 second interval
  netstats_interval=30 # i.e., 30 second interval
  postgres_interval=${DUR_5MIN_IN_SEC}
  rabbitmq_interval=${DUR_5MIN_IN_SEC}
  ceph_interval=${DUR_5MIN_IN_SEC}
  diskstats_interval=${DUR_5MIN_IN_SEC}
  memstats_interval=${DUR_5MIN_IN_SEC}
  filestats_interval=${DUR_5MIN_IN_SEC}
fi

# Number of samples each repeat-driven tool takes to cover PERIOD_MIN minutes
# Uses $(( )) arithmetic expansion; the older $[ ] form is deprecated.
schedtop_repeat=$(( PERIOD_MIN * 60 / schedtop_interval ))
occtop_repeat=$(( PERIOD_MIN * 60 / occtop_interval ))
memtop_repeat=$(( PERIOD_MIN * 60 / memtop_interval ))
netstats_repeat=$(( PERIOD_MIN * 60 / netstats_interval ))

# Disable use of INTERVAL_SEC sample interval
OPT_USE_INTERVALS=0
# Define parallel engtools configuration
# - tool name, filename, and collection interval attributes
BINDIR=/usr/bin
LBINDIR=/usr/local/bin

# The ENABLE_* switches tested below are not assigned anywhere in this
# script; presumably they are defined by this configuration file.
. /etc/engtools/engtools.conf

# Each tlist entry is one string of key=value attributes
declare -a tlist
if [[ ${ENABLE_STATIC_COLLECTION} == [Yy] ]]; then
  # tools collected on every host
  tlist+=(
    "tool=${LBINDIR}/top.sh name=top period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}"
    "tool=${LBINDIR}/iostat.sh name=iostat period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}"
    "tool=${LBINDIR}/netstats.sh name=netstats period=${PERIOD_MIN} interval=${netstats_interval}"
    "tool=${BINDIR}/occtop name=occtop repeat=${occtop_repeat} interval=${occtop_interval}"
    "tool=${BINDIR}/memtop name=memtop repeat=${memtop_repeat} interval=${memtop_interval}"
    "tool=${BINDIR}/schedtop name=schedtop repeat=${schedtop_repeat} interval=${schedtop_interval}"
    "tool=${LBINDIR}/diskstats.sh name=diskstats period=${PERIOD_MIN} interval=${diskstats_interval}"
    "tool=${LBINDIR}/memstats.sh name=memstats period=${PERIOD_MIN} interval=${memstats_interval}"
    "tool=${LBINDIR}/filestats.sh name=filestats period=${PERIOD_MIN} interval=${filestats_interval}"
  )

  # host specific tools, chosen by a substring of the host name
  case "${HOSTNAME}" in
    *controller-*)
      tlist+=(
        "tool=${LBINDIR}/ceph.sh name=ceph period=${PERIOD_MIN} interval=${ceph_interval}"
        "tool=${LBINDIR}/postgres.sh name=postgres period=${PERIOD_MIN} interval=${postgres_interval}"
        "tool=${LBINDIR}/rabbitmq.sh name=rabbitmq period=${PERIOD_MIN} interval=${rabbitmq_interval}"
      )
      ;;
    *compute-*)
      tlist+=(
        "tool=${LBINDIR}/vswitch.sh name=vswitch period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}"
      )
      ;;
  esac

  # ticker - shows progress on the screen
  # (the empty name= means no output file is created for it)
  tlist+=( "tool=${LBINDIR}/ticker.sh name= period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
fi

# Start the live stats streamer in the background when it is enabled
if [[ ${ENABLE_LIVE_STREAM} == [Yy] ]]; then
  ${TOOLBIN}/live_stream.py &
fi
#-------------------------------------------------------------------------------
# Main loop
#-------------------------------------------------------------------------------
# NOTE(review): this assignment switches debug output off regardless of the
# OPT_DEBUG value set near the top of the script or in /etc/default --
# confirm that the override is intended.
OPT_DEBUG=0
REP=0
if [ ${#tlist[@]} -ne 0 ]; then
  # Static stats collection is turned on
  # Runs until TOOL_USR1_SIGNAL becomes non-zero, or until REPEATS increments
  # are done unless OPT_FOREVER is 1 (both flags are set outside this file).
  while [[ ${TOOL_USR1_SIGNAL} -eq 0 ]] &&
        [[ ${OPT_FOREVER} -eq 1 || ${REP} -lt ${REPEATS} ]]
  do
    # increment loop counter
    ((REP++))

    # purge oldest files
    purge_oldest_files

    # define filename timestamp
    timestamp=$( date +"%Y-%0m-%0e_%H%M" )

    # collect tools in parallel to separate output files
    LOG "collecting ${TOOLNAME} at ${timestamp} for ${PERIOD_MIN} mins, repeat=${REP}"
    do_parallel_commands
    # NOTE(review): a bare 'wait' blocks on every background child of this
    # shell, which includes live_stream.py when it was started above --
    # confirm that it detaches, otherwise this loop cannot advance.
    wait

    # Compress latest increment
    # [*] joins the file names into a single argument for LOG
    LOG "compressing: ${parallel_outfiles[*]}"
    # Without any file argument bzip2 would compress stdin, so only start it
    # when there is something to compress.  CMD_IDLE is expanded unquoted on
    # purpose: it holds a multi-word command prefix.
    if [ ${#parallel_outfiles[@]} -gt 0 ]; then
      ${CMD_IDLE} bzip2 -q -f "${parallel_outfiles[@]}" 2>/dev/null &
    fi
  done

  # Wait for the compression to complete
  wait
  tools_cleanup 0
fi

# Should wait here in case live stats streaming is turned on.
wait
exit 0