Various engtools fixes
- Change engtools init order to ensure that stats streaming agents on the compute and storage nodes do not start prematurely after DOR (Dead Office Recovery)
- Workaround a systemd preun scriptlet issue that caused patch removal failure
- Stream database stats in batches (max 10 DBs/batch)
- Account for new processes

Story: 2002895
Task: 22858
Change-Id: Iaeeca7f51b442c27fc475777abc612d53dc97ce5
Signed-off-by: Jack Ding <jack.ding@windriver.com>
Signed-off-by: Scott Little <scott.little@windriver.com>
This commit is contained in:
parent
30ab05568b
commit
610a9282c9
@@ -10,7 +10,7 @@ BuildArch: noarch
|
||||
Source: %{name}-%{version}.tar.gz
|
||||
|
||||
BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root
|
||||
|
||||
BuildRequires: systemd
|
||||
Requires: iperf3
|
||||
|
||||
%description
|
||||
|
@@ -15,7 +15,7 @@ DURATION=
|
||||
|
||||
[StaticCollection]
|
||||
# Set this option to Y/N before patch creation to enable/disable static stats collection
|
||||
ENABLE_STATIC_COLLECTION=Y
|
||||
ENABLE_STATIC_COLLECTION=N
|
||||
|
||||
[CollectInternal]
|
||||
# controller external OAM interface used to communicate with remote server. If unset, the first interface from ifconfig will be used
|
||||
@@ -46,7 +46,7 @@ filestats=30
|
||||
netstats=10
|
||||
postgres=30
|
||||
rabbitmq=3600
|
||||
vswitch=30
|
||||
vswitch=120
|
||||
|
||||
[AdditionalOptions]
|
||||
# Set this option to Y/N to enable/disable Openstack API GET/POST collection
|
||||
@@ -62,10 +62,10 @@ FAST_POSTGRES_CONNECTIONS=N
|
||||
AUTO_DELETE_DB=N
|
||||
|
||||
[ControllerServices]
|
||||
CONTROLLER_SERVICE_LIST=aodh-api aodh-listener aodh-notifier aodh-evaluator beam.smp ceilometer-api ceilometer-collector ceilometer-agent-notification ceilometer-mem-db ceph-mon ceph-rest-api ceph-alarm-manager cinder-api cinder-volume cinder-scheduler glance-api glance-registry heat-api heat-engine heat-api-cfn heat-api-cloudwatch hbsAgent ironic-api ironic-conductor keystone-all magnum-api magnum-conductor neutron-server nova-api nova-api-proxy nova-compute nova-scheduler nova-conductor nova-console-auth nova-novncproxy nova-placement-api panko-api sysinv-api sysinv-conductor postgres fmManager rabbitmq-server gunicorn postgres snmpd patch-alarm-manager lighttpd sw-patch-controller-daemon nfv-vim nfv-vim-api nfv-vim-webserver slapd mtcAgent guestAgent
|
||||
CONTROLLER_SERVICE_LIST=aodh-api aodh-listener aodh-notifier aodh-evaluator beam.smp ceilometer-api ceilometer-collector ceilometer-agent-notification ceilometer-mem-db ceph-mon ceph-rest-api ceph-alarm-manager cinder-api cinder-volume cinder-scheduler glance-api glance-registry gnocchi-api gnocchi-metricd heat-api heat-engine heat-api-cfn heat-api-cloudwatch hbsAgent ironic-api ironic-conductor magnum-api magnum-conductor neutron-server nova-api nova-api-proxy nova-compute nova-scheduler nova-conductor nova-console-auth nova-novncproxy nova-placement-api panko-api sysinv-api sysinv-conductor postgres fmManager rabbitmq-server gunicorn postgres snmpd patch-alarm-manager lighttpd sw-patch-controller-daemon nfv-vim nfv-vim-api nfv-vim-webserver slapd mtcAgent guestAgent dcmanager-api dcmanager-manager dcorch-engine dcorch-neutron-api-proxy dcorch-nova-api-proxy dcorch-patch-api-proxy dcorch-snmp dcorch-sysinv-api-proxy memcached influxd
|
||||
|
||||
[ComputeServices]
|
||||
COMPUTE_SERVICE_LIST=nova-compute neutron-dhcp-agent neutron-metadata-agent neutron-sriov-nic-agent kvm libvirtd guestServer host_agent
|
||||
COMPUTE_SERVICE_LIST=nova-compute neutron-dhcp-agent neutron-metadata-agent neutron-sriov-nic-agent kvm libvirtd guestServer host_agent dmeventd virtlockd
|
||||
|
||||
[StorageServices]
|
||||
STORAGE_SERVICE_LIST=ceph-mon ceph-osd ceph-manager ceph-rest-api
|
||||
@@ -74,4 +74,4 @@ STORAGE_SERVICE_LIST=ceph-mon ceph-osd ceph-manager ceph-rest-api
|
||||
RABBITMQ_QUEUE_LIST=notifications.info versioned_notifications.info
|
||||
|
||||
[CommonServices]
|
||||
COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool
|
||||
COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool tuned polkitd lldpd IPaddr2 dnsmasq systemd-udevd systemd-journald logrotate collectd
|
||||
|
@@ -1,6 +1,7 @@
|
||||
[Unit]
|
||||
Description=Engineering data collection tools to monitor host performance
|
||||
After=network.service
|
||||
Requires=network.service
|
||||
After=network.service getty.target
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
|
@@ -270,14 +270,10 @@ OPT_USE_INTERVALS=0
|
||||
BINDIR=/usr/bin
|
||||
LBINDIR=/usr/local/bin
|
||||
|
||||
while IFS='' read -r line || [[ -n "$line" ]]; do
|
||||
if [[ $line =~ 'ENABLE_STATIC_COLLECTION'* ]]; then
|
||||
static_collection=${line:25:1}
|
||||
fi
|
||||
done < /etc/engtools/engtools.conf
|
||||
. /etc/engtools/engtools.conf
|
||||
|
||||
declare -a tlist
|
||||
if [[ $static_collection == "Y" ]] || [[ $static_collection == "y" ]]; then
|
||||
if [[ ${ENABLE_STATIC_COLLECTION} == "Y" ]] || [[ ${ENABLE_STATIC_COLLECTION} == "y" ]]; then
|
||||
tlist+=( "tool=${LBINDIR}/top.sh name=top period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
|
||||
tlist+=( "tool=${LBINDIR}/iostat.sh name=iostat period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
|
||||
tlist+=( "tool=${LBINDIR}/netstats.sh name=netstats period=${PERIOD_MIN} interval=${netstats_interval}" )
|
||||
@@ -290,45 +286,55 @@ if [[ $static_collection == "Y" ]] || [[ $static_collection == "y" ]]; then
|
||||
if [[ "${HOSTNAME}" =~ "controller-" ]]; then
|
||||
tlist+=( "tool=${LBINDIR}/ceph.sh name=ceph period=${PERIOD_MIN} interval=${ceph_interval}" )
|
||||
tlist+=( "tool=${LBINDIR}/postgres.sh name=postgres period=${PERIOD_MIN} interval=${postgres_interval}" )
|
||||
# tlist+=( "tool=${LBINDIR}/rabbitmq.sh name=rabbitmq period=${PERIOD_MIN} interval=${rabbitmq_interval}" )
|
||||
tlist+=( "tool=${LBINDIR}/rabbitmq.sh name=rabbitmq period=${PERIOD_MIN} interval=${rabbitmq_interval}" )
|
||||
elif [[ "${HOSTNAME}" =~ "compute-" ]]; then
|
||||
tlist+=( "tool=${LBINDIR}/vswitch.sh name=vswitch period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
|
||||
fi
|
||||
|
||||
# ticker - shows progress on the screen
|
||||
tlist+=( "tool=${LBINDIR}/ticker.sh name= period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
|
||||
fi
|
||||
|
||||
# ticker - shows progress on the screen
|
||||
tlist+=( "tool=${LBINDIR}/ticker.sh name= period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" )
|
||||
|
||||
if [[ ${ENABLE_LIVE_STREAM} == "Y" ]] || [[ ${ENABLE_LIVE_STREAM} == "y" ]]; then
|
||||
${TOOLBIN}/live_stream.py &
|
||||
fi
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Main loop
|
||||
#-------------------------------------------------------------------------------
|
||||
OPT_DEBUG=0
|
||||
REP=0
|
||||
while [[ ${TOOL_USR1_SIGNAL} -eq 0 ]] &&
|
||||
[[ ${OPT_FOREVER} -eq 1 || ${REP} -lt ${REPEATS} ]]
|
||||
do
|
||||
# increment loop counter
|
||||
((REP++))
|
||||
|
||||
# purge oldest files
|
||||
purge_oldest_files
|
||||
if [ ${#tlist[@]} -ne 0 ]; then
|
||||
# Static stats collection is turned on
|
||||
while [[ ${TOOL_USR1_SIGNAL} -eq 0 ]] &&
|
||||
[[ ${OPT_FOREVER} -eq 1 || ${REP} -lt ${REPEATS} ]]
|
||||
do
|
||||
# increment loop counter
|
||||
((REP++))
|
||||
|
||||
# define filename timestamp
|
||||
timestamp=$( date +"%Y-%0m-%0e_%H%M" )
|
||||
# purge oldest files
|
||||
purge_oldest_files
|
||||
|
||||
# collect tools in parallel to separate output files
|
||||
LOG "collecting ${TOOLNAME} at ${timestamp} for ${PERIOD_MIN} mins, repeat=${REP}"
|
||||
do_parallel_commands
|
||||
# define filename timestamp
|
||||
timestamp=$( date +"%Y-%0m-%0e_%H%M" )
|
||||
|
||||
# collect tools in parallel to separate output files
|
||||
LOG "collecting ${TOOLNAME} at ${timestamp} for ${PERIOD_MIN} mins, repeat=${REP}"
|
||||
do_parallel_commands
|
||||
wait
|
||||
|
||||
# Compress latest increment
|
||||
LOG "compressing: ${parallel_outfiles[@]}"
|
||||
${CMD_IDLE} bzip2 -q -f ${parallel_outfiles[@]} 2>/dev/null &
|
||||
done
|
||||
|
||||
# Wait for the compression to complete
|
||||
wait
|
||||
tools_cleanup 0
|
||||
fi
|
||||
|
||||
# Compress latest increment
|
||||
LOG "compressing: ${parallel_outfiles[@]}"
|
||||
${CMD_IDLE} bzip2 -q -f ${parallel_outfiles[@]} 2>/dev/null &
|
||||
done
|
||||
|
||||
# wait for compression to complete
|
||||
# Should wait here in case live stats streaming is turned on.
|
||||
wait
|
||||
|
||||
tools_cleanup 0
|
||||
exit 0
|
||||
|
@@ -60,7 +60,6 @@ case $1 in
|
||||
log_daemon_msg "Starting ${NAME}"
|
||||
if start-stop-daemon --start --background --quiet --oknodo --pidfile ${PIDFILE} \
|
||||
--exec ${DAEMON} -- ${DAEMON_ARGS} ; then
|
||||
./usr/local/bin/live_stream.py &
|
||||
log_end_msg 0
|
||||
else
|
||||
log_end_msg 1
|
||||
|
@@ -167,6 +167,10 @@ def collectMemstats(influx_info, node, ci, services, syseng_services, openstack_
|
||||
fields[gsvc]["vsz"] += vsz
|
||||
|
||||
elif svc == "postgres":
|
||||
if (len(line) <= i+2):
|
||||
# Command line could be "sudo su postgres", skip it
|
||||
break
|
||||
|
||||
if line[i + 1].startswith("-") is False and line[i + 1].startswith("_") is False and line[i + 1] != "psql":
|
||||
psvc = ""
|
||||
if line[i + 2] in openstack_services:
|
||||
@@ -284,6 +288,10 @@ def collectSchedtop(influx_info, node, ci, services, syseng_services, openstack_
|
||||
fields[gsvc] += occ
|
||||
|
||||
elif svc == "postgres":
|
||||
if (len(line) <= i+2):
|
||||
# Command line could be "sudo su postgres", skip it
|
||||
break
|
||||
|
||||
if line[i + 1].startswith("-") is False and line[i + 1].startswith("_") is False and line[i + 1] != "psql":
|
||||
psvc = ""
|
||||
if line[i + 2] in openstack_services:
|
||||
@@ -589,20 +597,22 @@ def collectPostgres(influx_info, node, ci):
|
||||
postgres_output = postgres_output1 = None
|
||||
influx_string = influx_string1 = ""
|
||||
good_string = False
|
||||
dbcount = 0
|
||||
BATCH_SIZE = 10
|
||||
|
||||
while True:
|
||||
try:
|
||||
# make sure this is active controller, otherwise postgres queries wont work
|
||||
if isActiveController():
|
||||
while True:
|
||||
# get list of databases and their sizes
|
||||
postgres_output = Popen("sudo -u postgres psql --pset pager=off -q -t -c'SELECT datname, pg_database_size(datname) FROM pg_database WHERE datistemplate = false;'", shell=True, stdout=PIPE)
|
||||
lines = postgres_output.stdout.read().replace(" ", "").strip().split("\n")
|
||||
if lines == "" or lines is None:
|
||||
db_lines = postgres_output.stdout.read().replace(" ", "").strip().split("\n")
|
||||
if db_lines == "" or db_lines is None:
|
||||
postgres_output.kill()
|
||||
break
|
||||
else:
|
||||
# for each database from the previous output
|
||||
for line in lines:
|
||||
for line in db_lines:
|
||||
if not line:
|
||||
break
|
||||
line = line.replace(" ", "").split("|")
|
||||
@@ -613,8 +623,8 @@ def collectPostgres(influx_info, node, ci):
|
||||
# get tables for each database
|
||||
sql = "SELECT table_schema,table_name,pg_size_pretty(table_size) AS table_size,pg_size_pretty(indexes_size) AS indexes_size,pg_size_pretty(total_size) AS total_size,live_tuples,dead_tuples FROM (SELECT table_schema,table_name,pg_table_size(table_name) AS table_size,pg_indexes_size(table_name) AS indexes_size,pg_total_relation_size(table_name) AS total_size,pg_stat_get_live_tuples(table_name::regclass) AS live_tuples,pg_stat_get_dead_tuples(table_name::regclass) AS dead_tuples FROM (SELECT table_schema,table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE') AS all_tables ORDER BY total_size DESC) AS pretty_sizes;"
|
||||
postgres_output1 = Popen('sudo -u postgres psql --pset pager=off -q -t -d{} -c"{}"'.format(line[0], sql), shell=True, stdout=PIPE)
|
||||
lines = postgres_output1.stdout.read().replace(" ", "").strip().split("\n")
|
||||
for line in lines:
|
||||
tbl_lines = postgres_output1.stdout.read().replace(" ", "").strip().split("\n")
|
||||
for line in tbl_lines:
|
||||
if line == "":
|
||||
continue
|
||||
else:
|
||||
@@ -648,6 +658,13 @@ def collectPostgres(influx_info, node, ci):
|
||||
fields1["dead_tuples"] = int(elements[6])
|
||||
influx_string1 += "{},'{}'='{}','{}'='{}','{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement1, "node", tags["node"], "service", tags["service"], "table_schema", tags["table_schema"], "table", tags["table"], "table_size", fields1["table_size"], "index_size", fields1["index_size"], "total_size", fields1["total_size"], "live_tuples", fields1["live_tuples"], "dead_tuples", fields1["dead_tuples"]) + "\n"
|
||||
good_string = True
|
||||
dbcount += 1
|
||||
if dbcount == BATCH_SIZE and good_string:
|
||||
# Curl will barf if the batch is too large
|
||||
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string1), shell=True)
|
||||
p.communicate()
|
||||
influx_string1 = ""
|
||||
dbcount = 0
|
||||
if good_string:
|
||||
# send table data to InfluxDB
|
||||
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True)
|
||||
@@ -655,6 +672,7 @@ def collectPostgres(influx_info, node, ci):
|
||||
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string1), shell=True)
|
||||
p.communicate()
|
||||
influx_string = influx_string1 = ""
|
||||
dbcount = 0
|
||||
time.sleep(ci["postgres"])
|
||||
postgres_output1.kill()
|
||||
postgres_output.kill()
|
||||
@@ -1331,7 +1349,7 @@ if __name__ == "__main__":
|
||||
live_svc = ("live_stream.py",)
|
||||
static_svcs = ("occtop", "memtop", "schedtop", "top.sh", "iostat.sh", "netstats.sh", "diskstats.sh", "memstats.sh", "filestats.sh", "ceph.sh", "postgres.sh", "rabbitmq.sh", "vswitch.sh")
|
||||
collection_intervals = {"memtop": None, "memstats": None, "occtop": None, "schedtop": None, "load_avg": None, "cpu_count": None, "diskstats": None, "iostat": None, "filestats": None, "netstats": None, "postgres": None, "rabbitmq": None, "vswitch": None}
|
||||
openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres")
|
||||
openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres", "panko", "nova_cell0", "magnum", "ironic", "murano", "gnocchi")
|
||||
# memstats, schedtop, and filestats must skip/exclude certain fields when collect_all is enabled. No need to collect this stuff
|
||||
exclude_list = ("python", "python2", "bash", "perl", "sudo", "init")
|
||||
skip_list = ("ps", "top", "sh", "<defunct>", "curl", "awk", "wc", "sleep", "lsof", "cut", "grep", "ip", "tail", "su")
|
||||
|
Loading…
Reference in New Issue
Block a user