Various engtools fixes
- Change engtools init order to ensure that stats streaming agents on the compute and storage nodes do not start prematurely after DOR (Dead Office Recovery)
- Workaround a systemd preun scriptlet issue that caused patch removal failure
- Stream database stats in batches (max 10 DBs/batch)
- Account for new processes

Story: 2002895
Task: 22858
Change-Id: Iaeeca7f51b442c27fc475777abc612d53dc97ce5
Signed-off-by: Jack Ding <jack.ding@windriver.com>
Signed-off-by: Scott Little <scott.little@windriver.com>
This commit is contained in:
parent
30ab05568b
commit
610a9282c9
@@ -10,7 +10,7 @@ BuildArch: noarch
|
||||
Source: %{name}-%{version}.tar.gz
|
||||
|
||||
BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root
|
||||
|
||||
BuildRequires: systemd
|
||||
Requires: iperf3
|
||||
|
||||
%description
|
||||
|
@@ -15,7 +15,7 @@ DURATION=
|
||||
|
||||
[StaticCollection]
|
||||
# Set this option to Y/N before patch creation to enable/disable static stats collection
|
||||
ENABLE_STATIC_COLLECTION=Y
|
||||
ENABLE_STATIC_COLLECTION=N
|
||||
|
||||
[CollectInternal]
|
||||
# controller external OAM interface used to communicate with remote server. If unset, the first interface from ifconfig will be used
|
||||
@@ -46,7 +46,7 @@ filestats=30
|
||||
netstats=10
|
||||
postgres=30
|
||||
rabbitmq=3600
|
||||
vswitch=30
|
||||
vswitch=120
|
||||
|
||||
[AdditionalOptions]
|
||||
# Set this option to Y/N to enable/disable Openstack API GET/POST collection
|
||||
@@ -62,10 +62,10 @@ FAST_POSTGRES_CONNECTIONS=N
|
||||
AUTO_DELETE_DB=N
|
||||
|
||||
[ControllerServices]
|
||||
CONTROLLER_SERVICE_LIST=aodh-api aodh-listener aodh-notifier aodh-evaluator beam.smp ceilometer-api ceilometer-collector ceilometer-agent-notification ceilometer-mem-db ceph-mon ceph-rest-api ceph-alarm-manager cinder-api cinder-volume cinder-scheduler glance-api glance-registry heat-api heat-engine heat-api-cfn heat-api-cloudwatch hbsAgent ironic-api ironic-conductor keystone-all magnum-api magnum-conductor neutron-server nova-api nova-api-proxy nova-compute nova-scheduler nova-conductor nova-console-auth nova-novncproxy nova-placement-api panko-api sysinv-api sysinv-conductor postgres fmManager rabbitmq-server gunicorn postgres snmpd patch-alarm-manager lighttpd sw-patch-controller-daemon nfv-vim nfv-vim-api nfv-vim-webserver slapd mtcAgent guestAgent
|
||||
CONTROLLER_SERVICE_LIST=aodh-api aodh-listener aodh-notifier aodh-evaluator beam.smp ceilometer-api ceilometer-collector ceilometer-agent-notification ceilometer-mem-db ceph-mon ceph-rest-api ceph-alarm-manager cinder-api cinder-volume cinder-scheduler glance-api glance-registry gnocchi-api gnocchi-metricd heat-api heat-engine heat-api-cfn heat-api-cloudwatch hbsAgent ironic-api ironic-conductor magnum-api magnum-conductor neutron-server nova-api nova-api-proxy nova-compute nova-scheduler nova-conductor nova-console-auth nova-novncproxy nova-placement-api panko-api sysinv-api sysinv-conductor postgres fmManager rabbitmq-server gunicorn postgres snmpd patch-alarm-manager lighttpd sw-patch-controller-daemon nfv-vim nfv-vim-api nfv-vim-webserver slapd mtcAgent guestAgent dcmanager-api dcmanager-manager dcorch-engine dcorch-neutron-api-proxy dcorch-nova-api-proxy dcorch-patch-api-proxy dcorch-snmp dcorch-sysinv-api-proxy memcached influxd
|
||||
|
||||
[ComputeServices]
|
||||
COMPUTE_SERVICE_LIST=nova-compute neutron-dhcp-agent neutron-metadata-agent neutron-sriov-nic-agent kvm libvirtd guestServer host_agent
|
||||
COMPUTE_SERVICE_LIST=nova-compute neutron-dhcp-agent neutron-metadata-agent neutron-sriov-nic-agent kvm libvirtd guestServer host_agent dmeventd virtlockd
|
||||
|
||||
[StorageServices]
|
||||
STORAGE_SERVICE_LIST=ceph-mon ceph-osd ceph-manager ceph-rest-api
|
||||
@@ -74,4 +74,4 @@ STORAGE_SERVICE_LIST=ceph-mon ceph-osd ceph-manager ceph-rest-api
|
||||
RABBITMQ_QUEUE_LIST=notifications.info versioned_notifications.info
|
||||
|
||||
[CommonServices]
|
||||
COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool
|
||||
COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool tuned polkitd lldpd IPaddr2 dnsmasq systemd-udevd systemd-journald logrotate collectd
|
||||
|
@@ -1,6 +1,7 @@
|
||||
[Unit]
|
||||
Description=Engineering data collection tools to monitor host performance
|
||||
After=network.service
|
||||
Requires=network.service
|
||||
After=network.service getty.target
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
|
@@ -270,14 +270,10 @@ OPT_USE_INTERVALS=0
|
||||
BINDIR=/usr/bin
|
||||
LBINDIR=/usr/local/bin
|
||||
|
||||
while IFS='' read -r line || [[ -n "$line" ]]; do
|
||||
if [[ $line =~ 'ENABLE_STATIC_COLLECTION'* ]]; then
|
||||
static_collection=${line:25:1}
|
||||
fi
|
||||
done < /etc/engtools/engtools.conf
|
||||
. /etc/engtools/engtools.conf
|
||||
|
||||
declare -a tlist
|
||||
if [[ $static_collection == "Y" ]] || [[ $static_collection == "y" ]]; then
|
||||
if [[ ${ENABLE_STATIC_COLLECTION} == "Y" ]] || [[ ${ENABLE_STATIC_COLLECTION} == "y" ]]; then
|
||||
tlist+=( "tool=${LBINDIR}/top.sh name=top period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
|
||||
tlist+=( "tool=${LBINDIR}/iostat.sh name=iostat period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
|
||||
tlist+=( "tool=${LBINDIR}/netstats.sh name=netstats period=${PERIOD_MIN} interval=${netstats_interval}" )
|
||||
@@ -290,45 +286,55 @@ if [[ $static_collection == "Y" ]] || [[ $static_collection == "y" ]]; then
|
||||
if [[ "${HOSTNAME}" =~ "controller-" ]]; then
|
||||
tlist+=( "tool=${LBINDIR}/ceph.sh name=ceph period=${PERIOD_MIN} interval=${ceph_interval}" )
|
||||
tlist+=( "tool=${LBINDIR}/postgres.sh name=postgres period=${PERIOD_MIN} interval=${postgres_interval}" )
|
||||
# tlist+=( "tool=${LBINDIR}/rabbitmq.sh name=rabbitmq period=${PERIOD_MIN} interval=${rabbitmq_interval}" )
|
||||
tlist+=( "tool=${LBINDIR}/rabbitmq.sh name=rabbitmq period=${PERIOD_MIN} interval=${rabbitmq_interval}" )
|
||||
elif [[ "${HOSTNAME}" =~ "compute-" ]]; then
|
||||
tlist+=( "tool=${LBINDIR}/vswitch.sh name=vswitch period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
|
||||
fi
|
||||
|
||||
# ticker - shows progress on the screen
|
||||
tlist+=( "tool=${LBINDIR}/ticker.sh name= period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
|
||||
fi
|
||||
|
||||
# ticker - shows progress on the screen
|
||||
tlist+=( "tool=${LBINDIR}/ticker.sh name= period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
|
||||
|
||||
if [[ ${ENABLE_LIVE_STREAM} == "Y" ]] || [[ ${ENABLE_LIVE_STREAM} == "y" ]]; then
|
||||
${TOOLBIN}/live_stream.py &
|
||||
fi
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Main loop
|
||||
#-------------------------------------------------------------------------------
|
||||
OPT_DEBUG=0
|
||||
REP=0
|
||||
while [[ ${TOOL_USR1_SIGNAL} -eq 0 ]] &&
|
||||
[[ ${OPT_FOREVER} -eq 1 || ${REP} -lt ${REPEATS} ]]
|
||||
do
|
||||
# increment loop counter
|
||||
((REP++))
|
||||
|
||||
# purge oldest files
|
||||
purge_oldest_files
|
||||
if [ ${#tlist[@]} -ne 0 ]; then
|
||||
# Static stats collection is turned on
|
||||
while [[ ${TOOL_USR1_SIGNAL} -eq 0 ]] &&
|
||||
[[ ${OPT_FOREVER} -eq 1 || ${REP} -lt ${REPEATS} ]]
|
||||
do
|
||||
# increment loop counter
|
||||
((REP++))
|
||||
|
||||
# define filename timestamp
|
||||
timestamp=$( date +"%Y-%0m-%0e_%H%M" )
|
||||
# purge oldest files
|
||||
purge_oldest_files
|
||||
|
||||
# collect tools in parallel to separate output files
|
||||
LOG "collecting ${TOOLNAME} at ${timestamp} for ${PERIOD_MIN} mins, repeat=${REP}"
|
||||
do_parallel_commands
|
||||
# define filename timestamp
|
||||
timestamp=$( date +"%Y-%0m-%0e_%H%M" )
|
||||
|
||||
# collect tools in parallel to separate output files
|
||||
LOG "collecting ${TOOLNAME} at ${timestamp} for ${PERIOD_MIN} mins, repeat=${REP}"
|
||||
do_parallel_commands
|
||||
wait
|
||||
|
||||
# Compress latest increment
|
||||
LOG "compressing: ${parallel_outfiles[@]}"
|
||||
${CMD_IDLE} bzip2 -q -f ${parallel_outfiles[@]} 2>/dev/null &
|
||||
done
|
||||
|
||||
# Wait for the compression to complete
|
||||
wait
|
||||
tools_cleanup 0
|
||||
fi
|
||||
|
||||
# Compress latest increment
|
||||
LOG "compressing: ${parallel_outfiles[@]}"
|
||||
${CMD_IDLE} bzip2 -q -f ${parallel_outfiles[@]} 2>/dev/null &
|
||||
done
|
||||
|
||||
# wait for compression to complete
|
||||
# Should wait here in case live stats streaming is turned on.
|
||||
wait
|
||||
|
||||
tools_cleanup 0
|
||||
exit 0
|
||||
|
@@ -60,7 +60,6 @@ case $1 in
|
||||
log_daemon_msg "Starting ${NAME}"
|
||||
if start-stop-daemon --start --background --quiet --oknodo --pidfile ${PIDFILE} \
|
||||
--exec ${DAEMON} -- ${DAEMON_ARGS} ; then
|
||||
./usr/local/bin/live_stream.py &
|
||||
log_end_msg 0
|
||||
else
|
||||
log_end_msg 1
|
||||
|
@@ -167,6 +167,10 @@ def collectMemstats(influx_info, node, ci, services, syseng_services, openstack_
|
||||
fields[gsvc]["vsz"] += vsz
|
||||
|
||||
elif svc == "postgres":
|
||||
if (len(line) <= i+2):
|
||||
# Command line could be "sudo su postgres", skip it
|
||||
break
|
||||
|
||||
if line[i + 1].startswith("-") is False and line[i + 1].startswith("_") is False and line[i + 1] != "psql":
|
||||
psvc = ""
|
||||
if line[i + 2] in openstack_services:
|
||||
@@ -284,6 +288,10 @@ def collectSchedtop(influx_info, node, ci, services, syseng_services, openstack_
|
||||
fields[gsvc] += occ
|
||||
|
||||
elif svc == "postgres":
|
||||
if (len(line) <= i+2):
|
||||
# Command line could be "sudo su postgres", skip it
|
||||
break
|
||||
|
||||
if line[i + 1].startswith("-") is False and line[i + 1].startswith("_") is False and line[i + 1] != "psql":
|
||||
psvc = ""
|
||||
if line[i + 2] in openstack_services:
|
||||
@@ -589,20 +597,22 @@ def collectPostgres(influx_info, node, ci):
|
||||
postgres_output = postgres_output1 = None
|
||||
influx_string = influx_string1 = ""
|
||||
good_string = False
|
||||
dbcount = 0
|
||||
BATCH_SIZE = 10
|
||||
|
||||
while True:
|
||||
try:
|
||||
# make sure this is active controller, otherwise postgres queries wont work
|
||||
if isActiveController():
|
||||
while True:
|
||||
# get list of databases and their sizes
|
||||
postgres_output = Popen("sudo -u postgres psql --pset pager=off -q -t -c'SELECT datname, pg_database_size(datname) FROM pg_database WHERE datistemplate = false;'", shell=True, stdout=PIPE)
|
||||
lines = postgres_output.stdout.read().replace(" ", "").strip().split("\n")
|
||||
if lines == "" or lines is None:
|
||||
db_lines = postgres_output.stdout.read().replace(" ", "").strip().split("\n")
|
||||
if db_lines == "" or db_lines is None:
|
||||
postgres_output.kill()
|
||||
break
|
||||
else:
|
||||
# for each database from the previous output
|
||||
for line in lines:
|
||||
for line in db_lines:
|
||||
if not line:
|
||||
break
|
||||
line = line.replace(" ", "").split("|")
|
||||
@@ -613,8 +623,8 @@ def collectPostgres(influx_info, node, ci):
|
||||
# get tables for each database
|
||||
sql = "SELECT table_schema,table_name,pg_size_pretty(table_size) AS table_size,pg_size_pretty(indexes_size) AS indexes_size,pg_size_pretty(total_size) AS total_size,live_tuples,dead_tuples FROM (SELECT table_schema,table_name,pg_table_size(table_name) AS table_size,pg_indexes_size(table_name) AS indexes_size,pg_total_relation_size(table_name) AS total_size,pg_stat_get_live_tuples(table_name::regclass) AS live_tuples,pg_stat_get_dead_tuples(table_name::regclass) AS dead_tuples FROM (SELECT table_schema,table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE') AS all_tables ORDER BY total_size DESC) AS pretty_sizes;"
|
||||
postgres_output1 = Popen('sudo -u postgres psql --pset pager=off -q -t -d{} -c"{}"'.format(line[0], sql), shell=True, stdout=PIPE)
|
||||
lines = postgres_output1.stdout.read().replace(" ", "").strip().split("\n")
|
||||
for line in lines:
|
||||
tbl_lines = postgres_output1.stdout.read().replace(" ", "").strip().split("\n")
|
||||
for line in tbl_lines:
|
||||
if line == "":
|
||||
continue
|
||||
else:
|
||||
@@ -648,6 +658,13 @@ def collectPostgres(influx_info, node, ci):
|
||||
fields1["dead_tuples"] = int(elements[6])
|
||||
influx_string1 += "{},'{}'='{}','{}'='{}','{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement1, "node", tags["node"], "service", tags["service"], "table_schema", tags["table_schema"], "table", tags["table"], "table_size", fields1["table_size"], "index_size", fields1["index_size"], "total_size", fields1["total_size"], "live_tuples", fields1["live_tuples"], "dead_tuples", fields1["dead_tuples"]) + "\n"
|
||||
good_string = True
|
||||
dbcount += 1
|
||||
if dbcount == BATCH_SIZE and good_string:
|
||||
# Curl will barf if the batch is too large
|
||||
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string1), shell=True)
|
||||
p.communicate()
|
||||
influx_string1 = ""
|
||||
dbcount = 0
|
||||
if good_string:
|
||||
# send table data to InfluxDB
|
||||
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True)
|
||||
@@ -655,6 +672,7 @@ def collectPostgres(influx_info, node, ci):
|
||||
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string1), shell=True)
|
||||
p.communicate()
|
||||
influx_string = influx_string1 = ""
|
||||
dbcount = 0
|
||||
time.sleep(ci["postgres"])
|
||||
postgres_output1.kill()
|
||||
postgres_output.kill()
|
||||
@@ -1331,7 +1349,7 @@ if __name__ == "__main__":
|
||||
live_svc = ("live_stream.py",)
|
||||
static_svcs = ("occtop", "memtop", "schedtop", "top.sh", "iostat.sh", "netstats.sh", "diskstats.sh", "memstats.sh", "filestats.sh", "ceph.sh", "postgres.sh", "rabbitmq.sh", "vswitch.sh")
|
||||
collection_intervals = {"memtop": None, "memstats": None, "occtop": None, "schedtop": None, "load_avg": None, "cpu_count": None, "diskstats": None, "iostat": None, "filestats": None, "netstats": None, "postgres": None, "rabbitmq": None, "vswitch": None}
|
||||
openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres")
|
||||
openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres", "panko", "nova_cell0", "magnum", "ironic", "murano", "gnocchi")
|
||||
# memstats, schedtop, and filestats must skip/exclude certain fields when collect_all is enabled. No need to collect this stuff
|
||||
exclude_list = ("python", "python2", "bash", "perl", "sudo", "init")
|
||||
skip_list = ("ps", "top", "sh", "<defunct>", "curl", "awk", "wc", "sleep", "lsof", "cut", "grep", "ip", "tail", "su")
|
||||
|
Loading…
Reference in New Issue
Block a user