integ/tools/collector/scripts/collect_utils
Eric MacDonald 3f23ded7d0 Improve collectd tool to handle additional unrecoverable tar error
In some cases tar reports a different error string when it runs out of
space while creating a tarball: "tar: Error is not recoverable".

This update adds that string to the current list of strings that suggest
collect errored out due to space limitation.

Now that there are 3 error strings to look for, the block of script code that
checked for these error strings was re-factored to search a list of error
strings which makes the implementation cleaner and more maintainable.

If new error message strings are found in the future then such
strings need only be added to this new list.

Change-Id: Ifc677740c33372b48a4856ce00caea052580788d
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Signed-off-by: Scott Little <scott.little@windriver.com>
2018-08-01 12:24:05 -04:00

237 lines
6.1 KiB
Bash
Executable File

#! /bin/bash
#
# Copyright (c) 2013-2014 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
##########################################################################################
# Debug switch: dlog only produces output while this holds the string "true".
# It is changed at run time through set_debug_mode.
DEBUG=false
# Fail Codes
# Numeric status codes. FAIL_NODETYPE is used further down as the exit status
# when the nodetype of this host cannot be identified, and FAIL_OUT_OF_SPACE
# is the reason code reported by collect_errors.
PASS=0
FAIL=1
RETRY=2
FAIL_NODETYPE=3
# NOTE(review): FAIL_TIMEOUT and FAIL_TIMEOUT1..9 look like per-call-site
# timeout codes; their users are not in this part of the file - confirm.
FAIL_TIMEOUT=10
FAIL_TIMEOUT1=11
FAIL_TIMEOUT2=12
FAIL_TIMEOUT3=13
FAIL_TIMEOUT4=14
FAIL_TIMEOUT5=15
FAIL_TIMEOUT6=16
FAIL_TIMEOUT7=17
FAIL_TIMEOUT8=18
FAIL_TIMEOUT9=19
FAIL_PASSWORD=30
FAIL_PERMISSION=31
FAIL_CLEANUP=32
FAIL_UNREACHABLE=33
FAIL_HOSTNAME=34
FAIL_INACTIVE=35
FAIL_PERMISSION_SKIP=36
FAIL_OUT_OF_SPACE=37
FAIL_INSUFFICIENT_SPACE=38
FAIL_OUT_OF_SPACE_LOCAL=39
# NOTE(review): FAIL_CREATE has the same value (39) as FAIL_OUT_OF_SPACE_LOCAL,
# so the two reasons cannot be told apart by number - confirm this is intended.
FAIL_CREATE=39
# Warnings are above 200
WARN_WARNING=200
WARN_HOSTNAME=201
# Failure Strings
# These are the texts collect_errors searches for in COLLECT_ERROR_LOG
# (through the listOfOutOfSpaceErrors array).
FAIL_OUT_OF_SPACE_STR="No space left on device"
FAIL_TAR_OUT_OF_SPACE_STR="tar: Error is not recoverable"
FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device"
# The minimum amount of % free space on /scratch to allow collect to proceed
MIN_PERCENT_SPACE_REQUIRED=75
# Log file path/names
# COLLECT_ERROR_LOG receives the stderr of delimiter and log_slabinfo and is
# the file scanned by collect_errors.
# NOTE(review): COLLECT_LOG is not referenced in this part of the file.
COLLECT_LOG=/var/log/collect.log
COLLECT_ERROR_LOG=/tmp/collect_error.log
# Source the nova openrc file when it can be used.
#
# The openrc file is only sourced when the platform keyring directory exists
# and holds at least one .CREDENTIAL entry.  ACTIVE is set to "true" in that
# case; all output of the sourced file is discarded.
KEYRING_PATH="/opt/platform/.keyring"
if [[ -e ${KEYRING_PATH} ]]; then
    CRED=$(find /opt/platform/.keyring -name .CREDENTIAL)
    if [[ -n ${CRED} ]]; then
        NOVAOPENRC="/etc/nova/openrc"
        if [[ -e ${NOVAOPENRC} ]]; then
            ACTIVE=true
            source "${NOVAOPENRC}" > /dev/null 2>&1
        fi
    fi
fi
# Get the node and subfunction types.
#
# nodetype and subfunction are expected to be assigned by the platform
# configuration file when it exists.  Only "controller", "compute" and
# "storage" are accepted; anything else is logged to syslog and terminates
# the shell with FAIL_NODETYPE.  On success the values are published as
# NODETYPE and SUBFUNCTION.
nodetype=""
subfunction=""
PLATFORM_CONF=/etc/platform/platform.conf
if [ -e "${PLATFORM_CONF}" ] ; then
    source "${PLATFORM_CONF}"
fi

case "${nodetype}" in
    controller|compute|storage)
        ;;
    *)
        # COLLECT_TAG is only assigned further down this file, so it is
        # normally still unset here.  Without a default (and quoting)
        # "logger -t" would swallow the message as its tag and then wait
        # for a message on stdin.
        logger -t "${COLLECT_TAG:-COLLECT}" "could not identify nodetype ($nodetype)"
        exit ${FAIL_NODETYPE}
        ;;
esac

NODETYPE=$nodetype
SUBFUNCTION=$subfunction
# Setup an expect command completion file.
# This is used to force serialization of expect
# sequences and highlight command completion
# NOTE(review): none of these three values is used in this part of the file;
# the description above comes from the original comment - confirm with callers.
collect_done="collect done"
cmd_done_sig="expect done"
cmd_done_file="/usr/local/sbin/expect_done"
# Compression Commands
# tar flags used below: c=create, x=extract, v=verbose, z=gzip,
# h=archive the target of symlinks, f=archive file name follows.
TAR_ZIP_CMD="tar -cvzf"
TAR_UZIP_CMD="tar -xvzf"
TAR_CMD="tar -cvhf"
UNTAR_CMD="tar -xvf"
ZIP_CMD="gzip"
# Prefix that runs a command at the lowest CPU scheduling priority (nice 19).
NICE_CMD="/usr/bin/nice -n19"
# Prefix that runs a command in the best-effort I/O class (-c2) at its
# lowest priority level (-n7).
IONICE_CMD="/usr/bin/ionice -c2 -n7"
# Tag handed to "logger -t" by the logging helpers in this file.
COLLECT_TAG="COLLECT"
# Names of the date range options.
# NOTE(review): where these are parsed is not visible in this part of the file.
STARTDATE_OPTION="--start-date"
ENDDATE_OPTION="--end-date"
# ps invocation that lists every process (-e) as a hierarchy (-H) with an
# explicit column list (-o).
PROCESS_DETAIL_CMD="ps -e -H -o ruser,tid,pid,ppid,flags,stat,policy,rtprio,nice,priority,rss:10,vsz:10,sz:10,psr,stime,tty,cputime,wchan:14,cmd"
# Prints the build information file.
BUILD_INFO_CMD="cat /etc/build.info"
################################################################################
# Log Debug, Info or Error log message to syslog
################################################################################
# Send a message to syslog only, tagged with COLLECT_TAG.
#
# Arguments: $@ - the message; several arguments are passed on as-is and
#                 joined by logger.
function log
{
    # The expansions are quoted so the message is not word-split or
    # glob-expanded; "--" keeps a message that starts with "-" from being
    # parsed as a logger option.
    logger -t "${COLLECT_TAG}" -- "$@"
}
# Print an informational message on stdout and send it to syslog,
# tagged with COLLECT_TAG.
#
# Arguments: $@ - the message
function ilog
{
    echo "$@"
    # Quoted so the message is not word-split or glob-expanded; "--" keeps a
    # message that starts with "-" from being parsed as a logger option.
    logger -t "${COLLECT_TAG}" -- "$@"
    #logger -p local3.info -t ${COLLECT_TAG} $@
}
# Print an error message on stdout, prefixed with "Error: ", and send the
# unprefixed message to syslog tagged with COLLECT_TAG.
#
# Arguments: $@ - the message
function elog
{
    echo "Error: $*"
    # Quoted so the message is not word-split or glob-expanded; "--" keeps a
    # message that starts with "-" from being parsed as a logger option.
    logger -t "${COLLECT_TAG}" -- "$@"
}
# Print a warning message on stdout, prefixed with "Warning: ", and send the
# unprefixed message to syslog tagged with COLLECT_TAG.
#
# Arguments: $@ - the message
function wlog
{
    echo "Warning: $*"
    # Quoted so the message is not word-split or glob-expanded; "--" keeps a
    # message that starts with "-" from being parsed as a logger option.
    logger -t "${COLLECT_TAG}" -- "$@"
}
# Switch debug logging on or off.
#
# Arguments: $1 - new value for DEBUG; dlog only emits output when it is
#                 the string "true"
function set_debug_mode()
{
    DEBUG="$1"
}
# Send a debug message to syslog (tagged with COLLECT_TAG) and print it on
# stdout prefixed with "Debug: ".  Does nothing unless DEBUG is "true".
#
# Arguments: $@ - the message
function dlog()
{
    if [ "${DEBUG}" == true ] ; then
        # Quoted so the message is not word-split or glob-expanded; "--"
        # keeps a message that starts with "-" from being parsed as a
        # logger option.
        logger -t "${COLLECT_TAG}" -- "$@"
        echo "Debug: $*"
    fi
}
# Append a three line section banner to a file: a dashed rule, a line of the
# form "<date> : <hostname> : <label>", and a second dashed rule.
#
# Globals:   myhostname (read), COLLECT_ERROR_LOG (stderr is appended to it)
# Arguments: $1 - file to append the banner to
#            $2 - label for the section
function delimiter()
{
    local rule="--------------------------------------------------------------------"

    # The redirect targets are quoted so a path containing whitespace does
    # not fail with "ambiguous redirect"; grouping the three lines writes
    # the banner with a single append.
    {
        echo "${rule}"
        echo "$(date) : ${myhostname} : ${2}"
        echo "${rule}"
    } >> "${1}" 2>> "${COLLECT_ERROR_LOG}"
}
# Append a formatted copy of the kernel slab statistics to a file, adding a
# KiB column per cache and a closing TOTAL line.
#
# The 17-field line of the input is taken as the column header and the
# 16-field lines as cache entries.  For each entry
#   KiB = page_KiB * num_objs / obj_per_slab * pages_per_slab
# (0 when obj_per_slab is 0).
#
# Globals:   PAGE_SIZE (written), COLLECT_ERROR_LOG (awk stderr is appended)
# Arguments: $1 - file to append the report to
#            $2 - optional slabinfo source file; defaults to /proc/slabinfo
function log_slabinfo()
{
    local outfile=${1}
    local slabinfo=${2:-/proc/slabinfo}

    PAGE_SIZE=$(getconf PAGE_SIZE)

    # cat is kept on purpose: when the source cannot be read awk still sees
    # an empty stream and emits the TOTAL line, as it always has.
    # The redirect targets are quoted so paths with whitespace work.
    cat "${slabinfo}" | awk -v page_size_B="${PAGE_SIZE}" '
    BEGIN {page_KiB = page_size_B/1024; TOT_KiB = 0;}
    (NF == 17) {
        gsub(/[<>]/, "");
        printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8s\n",
            $2, $3, $4, $5, $6, $7, $8, $10, $11, $12, $13, $15, $16, $17, "KiB");
    }
    (NF == 16) {
        num_objs=$3; obj_per_slab=$5; pages_per_slab=$6;
        KiB = (obj_per_slab > 0) ? page_KiB*num_objs/obj_per_slab*pages_per_slab : 0;
        TOT_KiB += KiB;
        printf("%-22s %11d %8d %8d %10d %12d %1s %5d %10d %12d %1s %12d %9d %11d %8d\n",
            $1, $2, $3, $4, $5, $6, $7, $9, $10, $11, $12, $14, $15, $16, KiB);
    }
    END {
        printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8d\n",
            "TOTAL", "-", "-", "-", "-", "-", ":", "-", "-", "-", ":", "-", "-", "-", TOT_KiB);
    }
    ' >> "${outfile}" 2>> "${COLLECT_ERROR_LOG}"
}
###########################################################################
#
# Name       : collect_errors
#
# Description: Search COLLECT_ERROR_LOG for any of the known out-of-space
#              error strings listed in listOfOutOfSpaceErrors.
#              Return 0 if no such logs are found, or if the error log
#              does not exist.
#              Return 1 if such logs are found.
#
# Parameters : $1 - name of the host being collected from; only used in
#                   the messages that are produced.
#
# Assumptions: Caller should assume a non-zero return as an indication of
#              a corrupt or incomplete collect log
#
#              Create logs and screen echos that record the error for
#              the user. Whichever string matched, the reported reason is
#              always FAIL_OUT_OF_SPACE / FAIL_OUT_OF_SPACE_STR.
#
#              May look for other errors in the future; new strings only
#              need to be added to listOfOutOfSpaceErrors.
#
###########################################################################
listOfOutOfSpaceErrors=(
    "${FAIL_OUT_OF_SPACE_STR}"
    "${FAIL_TAR_OUT_OF_SPACE_STR}"
    "${FAIL_INSUFFICIENT_SPACE_STR}"
)

function collect_errors()
{
    local host=${1}
    local RC=0
    # Kept local so a caller's own "index"/"string" style globals are never
    # overwritten by this function.
    local pattern
    local msg

    if [ -e "${COLLECT_ERROR_LOG}" ] ; then

        ## now loop through known space related error strings
        for pattern in "${listOfOutOfSpaceErrors[@]}" ; do

            # An empty entry would match every line of the log; skip it.
            [ -n "${pattern}" ] || continue

            # -F: the entries are literal texts, not regular expressions.
            if grep -q -F -- "${pattern}" "${COLLECT_ERROR_LOG}" ; then

                msg="failed to collect from ${host} (reason:${FAIL_OUT_OF_SPACE}:${FAIL_OUT_OF_SPACE_STR})"

                # /var/log/user.log it
                logger -t "${COLLECT_TAG}" "${msg}"

                # logs that show up in the foreground
                echo "${msg}"
                echo "Increase available space in ${host}:${COLLECT_BASE_DIR} and retry operation."

                # return error code; one report is enough
                RC=1
                break
            fi
        done
    fi
    return ${RC}
}