From 610a9282c9217d72ba8ab924313bc9311f51fa59 Mon Sep 17 00:00:00 2001 From: Tee Ngo Date: Tue, 12 Jun 2018 16:52:34 -0400 Subject: [PATCH] Various engtools fixes - Change engtools init order to ensure that stats streaming agents on the compute and storage nodes do not start prematurely after DOR - Work around a systemd preun scriptlet issue that caused patch removal failure - Stream database stats in batches (max 10 DBs/batch) - Account for new processes Story: 2002895 Task: 22858 Change-Id: Iaeeca7f51b442c27fc475777abc612d53dc97ce5 Signed-off-by: Jack Ding Signed-off-by: Scott Little --- .../centos/collect-engtools.spec | 2 +- .../scripts/cfg/engtools.conf | 10 +-- .../scripts/collect-engtools.service | 3 +- .../scripts/collect-engtools.sh | 64 ++++++++++--------- .../scripts/init.d/collect-engtools.sh | 1 - .../scripts/live_stream.py | 32 ++++++++-- 6 files changed, 68 insertions(+), 44 deletions(-) diff --git a/tools/engtools/hostdata-collectors/centos/collect-engtools.spec b/tools/engtools/hostdata-collectors/centos/collect-engtools.spec index 91f2bb426..288577bce 100644 --- a/tools/engtools/hostdata-collectors/centos/collect-engtools.spec +++ b/tools/engtools/hostdata-collectors/centos/collect-engtools.spec @@ -10,7 +10,7 @@ BuildArch: noarch Source: %{name}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root - +BuildRequires: systemd Requires: iperf3 %description diff --git a/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf b/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf index b2b940da7..efe1b9b74 100644 --- a/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf +++ b/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf @@ -15,7 +15,7 @@ DURATION= [StaticCollection] # Set this option to Y/N before patch creation to enable/disable static stats collection -ENABLE_STATIC_COLLECTION=Y +ENABLE_STATIC_COLLECTION=N [CollectInternal] # controller external OAM interface used to communicate with remote 
server. If unset, the first interface from ifconfig will be used @@ -46,7 +46,7 @@ filestats=30 netstats=10 postgres=30 rabbitmq=3600 -vswitch=30 +vswitch=120 [AdditionalOptions] # Set this option to Y/N to enable/disable Openstack API GET/POST collection @@ -62,10 +62,10 @@ FAST_POSTGRES_CONNECTIONS=N AUTO_DELETE_DB=N [ControllerServices] -CONTROLLER_SERVICE_LIST=aodh-api aodh-listener aodh-notifier aodh-evaluator beam.smp ceilometer-api ceilometer-collector ceilometer-agent-notification ceilometer-mem-db ceph-mon ceph-rest-api ceph-alarm-manager cinder-api cinder-volume cinder-scheduler glance-api glance-registry heat-api heat-engine heat-api-cfn heat-api-cloudwatch hbsAgent ironic-api ironic-conductor keystone-all magnum-api magnum-conductor neutron-server nova-api nova-api-proxy nova-compute nova-scheduler nova-conductor nova-console-auth nova-novncproxy nova-placement-api panko-api sysinv-api sysinv-conductor postgres fmManager rabbitmq-server gunicorn postgres snmpd patch-alarm-manager lighttpd sw-patch-controller-daemon nfv-vim nfv-vim-api nfv-vim-webserver slapd mtcAgent guestAgent +CONTROLLER_SERVICE_LIST=aodh-api aodh-listener aodh-notifier aodh-evaluator beam.smp ceilometer-api ceilometer-collector ceilometer-agent-notification ceilometer-mem-db ceph-mon ceph-rest-api ceph-alarm-manager cinder-api cinder-volume cinder-scheduler glance-api glance-registry gnocchi-api gnocchi-metricd heat-api heat-engine heat-api-cfn heat-api-cloudwatch hbsAgent ironic-api ironic-conductor magnum-api magnum-conductor neutron-server nova-api nova-api-proxy nova-compute nova-scheduler nova-conductor nova-console-auth nova-novncproxy nova-placement-api panko-api sysinv-api sysinv-conductor postgres fmManager rabbitmq-server gunicorn postgres snmpd patch-alarm-manager lighttpd sw-patch-controller-daemon nfv-vim nfv-vim-api nfv-vim-webserver slapd mtcAgent guestAgent dcmanager-api dcmanager-manager dcorch-engine dcorch-neutron-api-proxy dcorch-nova-api-proxy 
dcorch-patch-api-proxy dcorch-snmp dcorch-sysinv-api-proxy memcached influxd [ComputeServices] -COMPUTE_SERVICE_LIST=nova-compute neutron-dhcp-agent neutron-metadata-agent neutron-sriov-nic-agent kvm libvirtd guestServer host_agent +COMPUTE_SERVICE_LIST=nova-compute neutron-dhcp-agent neutron-metadata-agent neutron-sriov-nic-agent kvm libvirtd guestServer host_agent dmeventd virtlockd [StorageServices] STORAGE_SERVICE_LIST=ceph-mon ceph-osd ceph-manager ceph-rest-api @@ -74,4 +74,4 @@ STORAGE_SERVICE_LIST=ceph-mon ceph-osd ceph-manager ceph-rest-api RABBITMQ_QUEUE_LIST=notifications.info versioned_notifications.info [CommonServices] -COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool +COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool tuned polkitd lldpd IPaddr2 dnsmasq systemd-udevd systemd-journald logrotate collectd diff --git a/tools/engtools/hostdata-collectors/scripts/collect-engtools.service b/tools/engtools/hostdata-collectors/scripts/collect-engtools.service index 9a68b2a31..e00e1cd65 100644 --- a/tools/engtools/hostdata-collectors/scripts/collect-engtools.service +++ b/tools/engtools/hostdata-collectors/scripts/collect-engtools.service @@ -1,6 +1,7 @@ [Unit] Description=Engineering data collection tools to monitor host performance -After=network.service +Requires=network.service +After=network.service getty.target [Service] Type=forking diff --git a/tools/engtools/hostdata-collectors/scripts/collect-engtools.sh b/tools/engtools/hostdata-collectors/scripts/collect-engtools.sh index 
908c2b762..7c1887a13 100644 --- a/tools/engtools/hostdata-collectors/scripts/collect-engtools.sh +++ b/tools/engtools/hostdata-collectors/scripts/collect-engtools.sh @@ -270,14 +270,10 @@ OPT_USE_INTERVALS=0 BINDIR=/usr/bin LBINDIR=/usr/local/bin -while IFS='' read -r line || [[ -n "$line" ]]; do - if [[ $line =~ 'ENABLE_STATIC_COLLECTION'* ]]; then - static_collection=${line:25:1} - fi -done < /etc/engtools/engtools.conf +. /etc/engtools/engtools.conf declare -a tlist -if [[ $static_collection == "Y" ]] || [[ $static_collection == "y" ]]; then +if [[ ${ENABLE_STATIC_COLLECTION} == "Y" ]] || [[ ${ENABLE_STATIC_COLLECTION} == "y" ]]; then tlist+=( "tool=${LBINDIR}/top.sh name=top period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" ) tlist+=( "tool=${LBINDIR}/iostat.sh name=iostat period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" ) tlist+=( "tool=${LBINDIR}/netstats.sh name=netstats period=${PERIOD_MIN} interval=${netstats_interval}" ) @@ -290,45 +286,55 @@ if [[ $static_collection == "Y" ]] || [[ $static_collection == "y" ]]; then if [[ "${HOSTNAME}" =~ "controller-" ]]; then tlist+=( "tool=${LBINDIR}/ceph.sh name=ceph period=${PERIOD_MIN} interval=${ceph_interval}" ) tlist+=( "tool=${LBINDIR}/postgres.sh name=postgres period=${PERIOD_MIN} interval=${postgres_interval}" ) - # tlist+=( "tool=${LBINDIR}/rabbitmq.sh name=rabbitmq period=${PERIOD_MIN} interval=${rabbitmq_interval}" ) + tlist+=( "tool=${LBINDIR}/rabbitmq.sh name=rabbitmq period=${PERIOD_MIN} interval=${rabbitmq_interval}" ) elif [[ "${HOSTNAME}" =~ "compute-" ]]; then tlist+=( "tool=${LBINDIR}/vswitch.sh name=vswitch period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" ) fi + + # ticker - shows progress on the screen + tlist+=( "tool=${LBINDIR}/ticker.sh name= period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" ) fi -# ticker - shows progress on the screen -tlist+=( "tool=${LBINDIR}/ticker.sh name= period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" ) - +if [[ ${ENABLE_LIVE_STREAM} == "Y" ]] || [[ 
${ENABLE_LIVE_STREAM} == "y" ]]; then + ${TOOLBIN}/live_stream.py & +fi #------------------------------------------------------------------------------- # Main loop #------------------------------------------------------------------------------- OPT_DEBUG=0 REP=0 -while [[ ${TOOL_USR1_SIGNAL} -eq 0 ]] && - [[ ${OPT_FOREVER} -eq 1 || ${REP} -lt ${REPEATS} ]] -do - # increment loop counter - ((REP++)) - # purge oldest files - purge_oldest_files +if [ ${#tlist[@]} -ne 0 ]; then + # Static stats collection is turned on + while [[ ${TOOL_USR1_SIGNAL} -eq 0 ]] && + [[ ${OPT_FOREVER} -eq 1 || ${REP} -lt ${REPEATS} ]] + do + # increment loop counter + ((REP++)) - # define filename timestamp - timestamp=$( date +"%Y-%0m-%0e_%H%M" ) + # purge oldest files + purge_oldest_files - # collect tools in parallel to separate output files - LOG "collecting ${TOOLNAME} at ${timestamp} for ${PERIOD_MIN} mins, repeat=${REP}" - do_parallel_commands + # define filename timestamp + timestamp=$( date +"%Y-%0m-%0e_%H%M" ) + + # collect tools in parallel to separate output files + LOG "collecting ${TOOLNAME} at ${timestamp} for ${PERIOD_MIN} mins, repeat=${REP}" + do_parallel_commands + wait + + # Compress latest increment + LOG "compressing: ${parallel_outfiles[@]}" + ${CMD_IDLE} bzip2 -q -f ${parallel_outfiles[@]} 2>/dev/null & + done + + # Wait for the compression to complete wait + tools_cleanup 0 +fi - # Compress latest increment - LOG "compressing: ${parallel_outfiles[@]}" - ${CMD_IDLE} bzip2 -q -f ${parallel_outfiles[@]} 2>/dev/null & -done - -# wait for compression to complete +# Should wait here in case live stats streaming is turned on. 
wait -tools_cleanup 0 exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/init.d/collect-engtools.sh b/tools/engtools/hostdata-collectors/scripts/init.d/collect-engtools.sh index 5bc7b6b2d..6712e7976 100644 --- a/tools/engtools/hostdata-collectors/scripts/init.d/collect-engtools.sh +++ b/tools/engtools/hostdata-collectors/scripts/init.d/collect-engtools.sh @@ -60,7 +60,6 @@ case $1 in log_daemon_msg "Starting ${NAME}" if start-stop-daemon --start --background --quiet --oknodo --pidfile ${PIDFILE} \ --exec ${DAEMON} -- ${DAEMON_ARGS} ; then - ./usr/local/bin/live_stream.py & log_end_msg 0 else log_end_msg 1 diff --git a/tools/engtools/hostdata-collectors/scripts/live_stream.py b/tools/engtools/hostdata-collectors/scripts/live_stream.py index 8192048d7..d96773d39 100644 --- a/tools/engtools/hostdata-collectors/scripts/live_stream.py +++ b/tools/engtools/hostdata-collectors/scripts/live_stream.py @@ -167,6 +167,10 @@ def collectMemstats(influx_info, node, ci, services, syseng_services, openstack_ fields[gsvc]["vsz"] += vsz elif svc == "postgres": + if (len(line) <= i+2): + # Command line could be "sudo su postgres", skip it + break + if line[i + 1].startswith("-") is False and line[i + 1].startswith("_") is False and line[i + 1] != "psql": psvc = "" if line[i + 2] in openstack_services: @@ -284,6 +288,10 @@ def collectSchedtop(influx_info, node, ci, services, syseng_services, openstack_ fields[gsvc] += occ elif svc == "postgres": + if (len(line) <= i+2): + # Command line could be "sudo su postgres", skip it + break + if line[i + 1].startswith("-") is False and line[i + 1].startswith("_") is False and line[i + 1] != "psql": psvc = "" if line[i + 2] in openstack_services: @@ -589,20 +597,22 @@ def collectPostgres(influx_info, node, ci): postgres_output = postgres_output1 = None influx_string = influx_string1 = "" good_string = False + dbcount = 0 + BATCH_SIZE = 10 + while True: try: # make sure this is active controller, otherwise postgres queries wont work 
if isActiveController(): while True: - # get list of databases and their sizes postgres_output = Popen("sudo -u postgres psql --pset pager=off -q -t -c'SELECT datname, pg_database_size(datname) FROM pg_database WHERE datistemplate = false;'", shell=True, stdout=PIPE) - lines = postgres_output.stdout.read().replace(" ", "").strip().split("\n") - if lines == "" or lines is None: + db_lines = postgres_output.stdout.read().replace(" ", "").strip().split("\n") + if db_lines == "" or db_lines is None: postgres_output.kill() break else: # for each database from the previous output - for line in lines: + for line in db_lines: if not line: break line = line.replace(" ", "").split("|") @@ -613,8 +623,8 @@ def collectPostgres(influx_info, node, ci): # get tables for each database sql = "SELECT table_schema,table_name,pg_size_pretty(table_size) AS table_size,pg_size_pretty(indexes_size) AS indexes_size,pg_size_pretty(total_size) AS total_size,live_tuples,dead_tuples FROM (SELECT table_schema,table_name,pg_table_size(table_name) AS table_size,pg_indexes_size(table_name) AS indexes_size,pg_total_relation_size(table_name) AS total_size,pg_stat_get_live_tuples(table_name::regclass) AS live_tuples,pg_stat_get_dead_tuples(table_name::regclass) AS dead_tuples FROM (SELECT table_schema,table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE') AS all_tables ORDER BY total_size DESC) AS pretty_sizes;" postgres_output1 = Popen('sudo -u postgres psql --pset pager=off -q -t -d{} -c"{}"'.format(line[0], sql), shell=True, stdout=PIPE) - lines = postgres_output1.stdout.read().replace(" ", "").strip().split("\n") - for line in lines: + tbl_lines = postgres_output1.stdout.read().replace(" ", "").strip().split("\n") + for line in tbl_lines: if line == "": continue else: @@ -648,6 +658,13 @@ def collectPostgres(influx_info, node, ci): fields1["dead_tuples"] = int(elements[6]) influx_string1 += "{},'{}'='{}','{}'='{}','{}'='{}','{}'='{}' 
'{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement1, "node", tags["node"], "service", tags["service"], "table_schema", tags["table_schema"], "table", tags["table"], "table_size", fields1["table_size"], "index_size", fields1["index_size"], "total_size", fields1["total_size"], "live_tuples", fields1["live_tuples"], "dead_tuples", fields1["dead_tuples"]) + "\n" good_string = True + dbcount += 1 + if dbcount == BATCH_SIZE and good_string: + # Curl will barf if the batch is too large + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string1), shell=True) + p.communicate() + influx_string1 = "" + dbcount = 0 if good_string: # send table data to InfluxDB p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) @@ -655,6 +672,7 @@ def collectPostgres(influx_info, node, ci): p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string1), shell=True) p.communicate() influx_string = influx_string1 = "" + dbcount = 0 time.sleep(ci["postgres"]) postgres_output1.kill() postgres_output.kill() @@ -1331,7 +1349,7 @@ if __name__ == "__main__": live_svc = ("live_stream.py",) static_svcs = ("occtop", "memtop", "schedtop", "top.sh", "iostat.sh", "netstats.sh", "diskstats.sh", "memstats.sh", "filestats.sh", "ceph.sh", "postgres.sh", "rabbitmq.sh", "vswitch.sh") collection_intervals = {"memtop": None, "memstats": None, "occtop": None, "schedtop": None, "load_avg": None, "cpu_count": None, "diskstats": None, "iostat": None, "filestats": None, "netstats": None, "postgres": None, "rabbitmq": None, "vswitch": None} - openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", 
"postgres") + openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres", "panko", "nova_cell0", "magnum", "ironic", "murano", "gnocchi") # memstats, schedtop, and filestats must skip/exclude certain fields when collect_all is enabled. No need to collect this stuff exclude_list = ("python", "python2", "bash", "perl", "sudo", "init") skip_list = ("ps", "top", "sh", "", "curl", "awk", "wc", "sleep", "lsof", "cut", "grep", "ip", "tail", "su")