diff --git a/centos_pkg_dirs b/centos_pkg_dirs index c1c04cce8..b64cecb00 100644 --- a/centos_pkg_dirs +++ b/centos_pkg_dirs @@ -115,3 +115,4 @@ tools/collector grub/grubby utilities/platform-util tools/monitor-tools +tools/engtools/hostdata-collectors diff --git a/tools/engtools/hostdata-collectors/README b/tools/engtools/hostdata-collectors/README new file mode 100644 index 000000000..a5174af41 --- /dev/null +++ b/tools/engtools/hostdata-collectors/README @@ -0,0 +1,12 @@ +The Engineering tools is meant to be installed as a patch. Therefore, the RPM is generated as part +of the build but is not included in the image. Assuming your development environment is fully set up, +simply run patch-engtools.sh to generate the patch: + +In this directory ($MY_REPO/addons/wr-cgcs/layers/cgcs/middleware/util/recipes-common/engtools/hostdata-collectors), +enter the command: +>./patch-engtools.sh + +This generates ENGTOOLS-X.patch (X is Tis release version) which can be applied via sw-patch. + +The patch is built with --all-nodes option by default. This can be changed to a combination of the following: +--controller, --compute, --storage, --controller-compute, and --compute-lowlatency. 
diff --git a/tools/engtools/hostdata-collectors/centos/build_srpm.data b/tools/engtools/hostdata-collectors/centos/build_srpm.data new file mode 100644 index 000000000..81d405878 --- /dev/null +++ b/tools/engtools/hostdata-collectors/centos/build_srpm.data @@ -0,0 +1,2 @@ +SRC_DIR="scripts" +TIS_PATCH_VER=1 diff --git a/tools/engtools/hostdata-collectors/centos/collect-engtools.spec b/tools/engtools/hostdata-collectors/centos/collect-engtools.spec new file mode 100644 index 000000000..91f2bb426 --- /dev/null +++ b/tools/engtools/hostdata-collectors/centos/collect-engtools.spec @@ -0,0 +1,101 @@ +Summary: Host performance data collection tools package +Name: engtools +Version: 1.0 +Release: %{tis_patch_ver}%{?_tis_dist} +License: Apache-2.0 +Group: Tools +Packager: Wind River +URL: http://www.windriver.com/ +BuildArch: noarch +Source: %{name}-%{version}.tar.gz + +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root + +Requires: iperf3 + +%description +This package contains data collection tools to monitor host performance. +Tools are general purpose engineering and debugging related. Includes +overall memory, cpu occupancy, per-task cpu, per-task scheduling, per-task +io. + +# Don't try fancy stuff like debuginfo, which is useless on binary-only +# packages. Don't strip binary too +# Be sure buildpolicy set to do nothing +%define __spec_install_post %{nil} +%define debug_package %{nil} +%define __os_install_post %{_dbpath}/brp-compress +%define _binaries_in_noarch_packages_terminate_build 0 + +%define local_dir /usr/local +%define local_bindir %{local_dir}/bin/ +%define local_initdir /etc/init.d/ +%define local_confdir /etc/engtools/ +%define local_systemddir /etc/systemd/system/ + +%prep +%setup -q + +%build +# Empty section. 
+ +%install +mkdir -p %{buildroot} +install -d 755 %{buildroot}%{local_bindir} +# Installing additional tools, memtop, occtop and schedtop are already in the image +install -m 755 buddyinfo.py %{buildroot}%{local_bindir} +install -m 755 chewmem %{buildroot}%{local_bindir} +# Installing data collection scripts +install -m 755 ceph.sh %{buildroot}%{local_bindir} +install -m 755 cleanup-engtools.sh %{buildroot}%{local_bindir} +install -m 755 collect-engtools.sh %{buildroot}%{local_bindir} +install -m 755 diskstats.sh %{buildroot}%{local_bindir} +install -m 755 engtools_util.sh %{buildroot}%{local_bindir} +install -m 755 filestats.sh %{buildroot}%{local_bindir} +install -m 755 iostat.sh %{buildroot}%{local_bindir} +install -m 755 linux_benchmark.sh %{buildroot}%{local_bindir} +install -m 755 memstats.sh %{buildroot}%{local_bindir} +install -m 755 netstats.sh %{buildroot}%{local_bindir} +install -m 755 postgres.sh %{buildroot}%{local_bindir} +install -m 755 rabbitmq.sh %{buildroot}%{local_bindir} +install -m 755 remote/rbzip2-engtools.sh %{buildroot}%{local_bindir} +install -m 755 remote/rstart-engtools.sh %{buildroot}%{local_bindir} +install -m 755 remote/rstop-engtools.sh %{buildroot}%{local_bindir} +install -m 755 remote/rsync-engtools-data.sh %{buildroot}%{local_bindir} +install -m 755 slab.sh %{buildroot}%{local_bindir} +install -m 755 ticker.sh %{buildroot}%{local_bindir} +install -m 755 top.sh %{buildroot}%{local_bindir} +install -m 755 vswitch.sh %{buildroot}%{local_bindir} +install -m 755 live_stream.py %{buildroot}%{local_bindir} +# Installing conf file +install -d 755 %{buildroot}%{local_confdir} +install -m 644 -p -D cfg/engtools.conf %{buildroot}%{local_confdir} +# Installing init script +install -d 755 %{buildroot}%{local_initdir} +install -m 755 init.d/collect-engtools.sh %{buildroot}%{local_initdir} +# Installing service file +install -d 755 %{buildroot}%{local_systemddir} +install -m 644 -p -D collect-engtools.service %{buildroot}%{local_systemddir} + 
+%clean +rm -rf $RPM_BUILD_ROOT + +%files +%license LICENSE +%defattr(-,root,root,-) +%{local_bindir}/* +%{local_confdir}/* +%{local_initdir}/* +%{local_systemddir}/* + +%post +/bin/systemctl enable collect-engtools.service > /dev/null 2>&1 +/bin/systemctl start collect-engtools.service > /dev/null 2>&1 + +%preun +#/bin/systemctl --no-reload disable collect-engtools.sh.service > /dev/null 2>&1 +#/bin/systemctl stop collect-engtools.sh.service > /dev/null 2>&1 +%systemd_preun collect-engtools.service + +%postun +%systemd_postun_with_restart collect-engtools.service diff --git a/tools/engtools/hostdata-collectors/patch-engtools.sh b/tools/engtools/hostdata-collectors/patch-engtools.sh new file mode 100755 index 000000000..bf9a73d4d --- /dev/null +++ b/tools/engtools/hostdata-collectors/patch-engtools.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Designer patches: +# http://twiki.wrs.com/PBUeng/Patching + +if [ -z $MY_WORKSPACE ] || [ -z $MY_REPO ]; then + echo "Some dev environment variables are not set." + echo "Refer to http://wiki.wrs.com/PBUeng/CentOSBuildProcess for instructions." + exit 1 +fi + +ENGTOOLS=$(ls ${MY_WORKSPACE}/std/rpmbuild/RPMS/engtools*noarch.rpm 2>/dev/null) +if [ $? -ne 0 ]; then + echo "Engtools RPM has not been built. Please run \"build-pkgs engtools\" first." + exit 1 +fi + +source ${MY_REPO}/addons/wr-cgcs/layers/cgcs/middleware/recipes-common/build-info/release-info.inc +#TiS_REL="16.10" +#PATCH_ID="ENGTOOLS-${TiS_REL}" +PATCH_ID="ENGTOOLS-${PLATFORM_RELEASE}" + +PWD=$(pwd) + +# Create CGCS Patch +cd ${MY_WORKSPACE} +PATCH_BUILD=${MY_REPO}/addons/wr-cgcs/layers/cgcs/extras.ND/scripts/patch_build.sh +${PATCH_BUILD} --id ${PATCH_ID} --reboot-required=N \ + --summary "System engineering data collection and analysis tools." \ + --desc "System engineering data collection and analysis tools." \ + --all-nodes ${ENGTOOLS} \ + --warn "Intended for system engineering use only." 
+cd ${PWD} +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/LICENSE b/tools/engtools/hostdata-collectors/scripts/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/tools/engtools/hostdata-collectors/scripts/buddyinfo.py b/tools/engtools/hostdata-collectors/scripts/buddyinfo.py new file mode 100644 index 000000000..2ccfd99f1 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/buddyinfo.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 textwidth=79 autoindent + +""" +Python source code +Last modified: 15 Feb 2014 - 13:38 +Last author: lmwangi at gmail com +Displays the available memory fragments +by querying /proc/buddyinfo +Example: +# python buddyinfo.py +""" +import optparse +import os +import re +from collections import defaultdict +import logging + + +class Logger: + def __init__(self, log_level): + self.log_level = log_level + + def get_formatter(self): + return logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + + def get_handler(self): + return logging.StreamHandler() + + def get_logger(self): + """Returns a Logger instance for the specified module_name""" + logger = logging.getLogger('main') + logger.setLevel(self.log_level) + log_handler = self.get_handler() + log_handler.setFormatter(self.get_formatter()) + logger.addHandler(log_handler) + return logger + + +class BuddyInfo(object): + """BuddyInfo DAO""" + def __init__(self, logger): + super(BuddyInfo, self).__init__() + self.log = logger + self.buddyinfo = self.load_buddyinfo() + + def parse_line(self, line): + line = line.strip() + self.log.debug("Parsing line: %s" % line) + parsed_line = re.match("Node\s+(?P\d+).*zone\s+(?P\w+)\s+(?P.*)", line).groupdict() + self.log.debug("Parsed line: %s" % parsed_line) + return parsed_line + + def read_buddyinfo(self): + buddyhash = defaultdict(list) + buddyinfo = open("/proc/buddyinfo").readlines() + for line in map(self.parse_line, buddyinfo): + numa_node = int(line["numa_node"]) + zone = line["zone"] + free_fragments = map(int, line["nr_free"].split()) + max_order = len(free_fragments) + fragment_sizes = 
self.get_order_sizes(max_order) + usage_in_bytes = [block[0] * block[1] for block in zip(free_fragments, fragment_sizes)] + buddyhash[numa_node].append({ + "zone": zone, + "nr_free": free_fragments, + "sz_fragment": fragment_sizes, + "usage": usage_in_bytes }) + return buddyhash + + def load_buddyinfo(self): + buddyhash = self.read_buddyinfo() + self.log.info(buddyhash) + return buddyhash + + def page_size(self): + return os.sysconf("SC_PAGE_SIZE") + + def get_order_sizes(self, max_order): + return [self.page_size() * 2**order for order in range(0, max_order)] + + def __str__(self): + ret_string = "" + width = 20 + for node in self.buddyinfo: + ret_string += "Node: %s\n" % node + for zoneinfo in self.buddyinfo.get(node): + ret_string += " Zone: %s\n" % zoneinfo.get("zone") + ret_string += " Free KiB in zone: %.2f\n" % (sum(zoneinfo.get("usage")) / (1024.0)) + ret_string += '\t{0:{align}{width}} {1:{align}{width}} {2:{align}{width}}\n'.format( + "Fragment size", "Free fragments", "Total available KiB", + width=width, + align="<") + for idx in range(len(zoneinfo.get("sz_fragment"))): + ret_string += '\t{order:{align}{width}} {nr:{align}{width}} {usage:{align}{width}}\n'.format( + width=width, + align="<", + order = zoneinfo.get("sz_fragment")[idx], + nr = zoneinfo.get("nr_free")[idx], + usage = zoneinfo.get("usage")[idx] / 1024.0) + + return ret_string + +def main(): + """Main function. 
Called when this file is a shell script""" + usage = "usage: %prog [options]" + parser = optparse.OptionParser(usage) + parser.add_option("-s", "--size", dest="size", choices=["B","K","M"], + action="store", type="choice", help="Return results in bytes, kib, mib") + + (options, args) = parser.parse_args() + logger = Logger(logging.DEBUG).get_logger() + logger.info("Starting....") + logger.info("Parsed options: %s" % options) + print logger + buddy = BuddyInfo(logger) + print buddy + +if __name__ == '__main__': + main() + diff --git a/tools/engtools/hostdata-collectors/scripts/ceph.sh b/tools/engtools/hostdata-collectors/scripts/ceph.sh new file mode 100644 index 000000000..40014406f --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/ceph.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Usage: ceph.sh [-p ] [-i ] [-c ] [-h] +TOOLBIN=$(dirname $0) + +# Initialize tools environment variables, and define common utility functions +. ${TOOLBIN}/engtools_util.sh +tools_init +if [ $? -ne 0 ]; then + echo "FATAL, tools_init - could not setup environment" + exit $? +fi + +# Enable use of INTERVAL_SEC sample interval +OPT_USE_INTERVALS=1 + +# Print key ceph statistics +function print_ceph() +{ + print_separator + TOOL_HIRES_TIME + + cmd='ceph -s' + ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO} + + cmd='ceph osd tree' + ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO} + + cmd='ceph df detail' + ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO} +} + +#------------------------------------------------------------------------------- +# MAIN Program: +#------------------------------------------------------------------------------- +# Parse input options +tools_parse_options "${@}" + +# Set affinity of current script +CPULIST="" +set_affinity ${CPULIST} + +LOG "collecting ${TOOLNAME} for ${PERIOD_MIN} minutes, with ${INTERVAL_SEC} second sample intervals." 
+ +# Print tools generic tools header +tools_header + +# Calculate number of sample repeats based on overall interval and sampling interval +((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC)) + +for ((rep=1; rep <= REPEATS ; rep++)) +do + print_ceph + sleep ${INTERVAL_SEC} +done +print_ceph +LOG "done" + +# normal program exit +tools_cleanup 0 +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf b/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf new file mode 100644 index 000000000..b2b940da7 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf @@ -0,0 +1,77 @@ +# engtools configuration + +# You may comment out any unwanted fields under the Intervals section, but do not comment out any other configuration options as the python parsing utility will complain. Please follow the comments + +[LabConfiguration] +# Set this option to Y/N depending on the setup of your lab +CPE_LAB=N + +[LiveStream] +# Set this option to Y/N before patch creation to enable/disable live stats collection +ENABLE_LIVE_STREAM=Y + +# Set the duration of the live stream capture utility. Leave blank for continuous collection. Ex: 1s,1m,1h,1d +DURATION= + +[StaticCollection] +# Set this option to Y/N before patch creation to enable/disable static stats collection +ENABLE_STATIC_COLLECTION=Y + +[CollectInternal] +# controller external OAM interface used to communicate with remote server. If unset, the first interface from ifconfig will be used +CONTROLLER0_EXTERNAL_INTERFACE= +CONTROLLER1_EXTERNAL_INTERFACE= + +[RemoteServer] +# remote server influx and grafana info +INFLUX_IP=128.224.186.61 +INFLUX_PORT=8086 +INFLUX_DB= +GRAFANA_PORT=3000 + +# This key is created through Grafana. 
If deleted, a new key (with admin privileges) must be created and copied here +GRAFANA_API_KEY=eyJrIjoiSkR1SXcxbkVVckd1dW9PMHFKS0EzQ2hQWTd1YUhtSkIiLCJuIjoiZGJfY3JlYXRvciIsImlkIjoxfQ== + +[Intervals] +# Set the collection interval (in seconds) to be used in the live_stream.py script. If unset or commented out, that field will not be collected +memtop=10 +memstats=10 +occtop=10 +schedtop=10 +load_avg=3 +cpu_count=60 +diskstats=30 +iostat=10 +filestats=30 +netstats=10 +postgres=30 +rabbitmq=3600 +vswitch=30 + +[AdditionalOptions] +# Set this option to Y/N to enable/disable Openstack API GET/POST collection +API_REQUESTS=N + +# Set this option to Y/N to enable/disable the collection of all services and not just the ones listed below. Note that this hasn't been tested thoroughly +ALL_SERVICES=N + +# Set this option to Y/N to enable/disable fast postgres connections collection. By default, postgres connections use the same collection interval as postgres DB size (set above), this option will set the collection interval to 0 seconds while not affecting the above postgres collection interval +FAST_POSTGRES_CONNECTIONS=N + +# Set this option to Y/N to enable/disable automatic database deletion for InfluxDB and Grafana. 
As of now, this feature does not work with the engtools patch +AUTO_DELETE_DB=N + +[ControllerServices] +CONTROLLER_SERVICE_LIST=aodh-api aodh-listener aodh-notifier aodh-evaluator beam.smp ceilometer-api ceilometer-collector ceilometer-agent-notification ceilometer-mem-db ceph-mon ceph-rest-api ceph-alarm-manager cinder-api cinder-volume cinder-scheduler glance-api glance-registry heat-api heat-engine heat-api-cfn heat-api-cloudwatch hbsAgent ironic-api ironic-conductor keystone-all magnum-api magnum-conductor neutron-server nova-api nova-api-proxy nova-compute nova-scheduler nova-conductor nova-console-auth nova-novncproxy nova-placement-api panko-api sysinv-api sysinv-conductor postgres fmManager rabbitmq-server gunicorn postgres snmpd patch-alarm-manager lighttpd sw-patch-controller-daemon nfv-vim nfv-vim-api nfv-vim-webserver slapd mtcAgent guestAgent + +[ComputeServices] +COMPUTE_SERVICE_LIST=nova-compute neutron-dhcp-agent neutron-metadata-agent neutron-sriov-nic-agent kvm libvirtd guestServer host_agent + +[StorageServices] +STORAGE_SERVICE_LIST=ceph-mon ceph-osd ceph-manager ceph-rest-api + +[RabbitmqServices] +RABBITMQ_QUEUE_LIST=notifications.info versioned_notifications.info + +[CommonServices] +COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool diff --git a/tools/engtools/hostdata-collectors/scripts/chewmem b/tools/engtools/hostdata-collectors/scripts/chewmem new file mode 100644 index 000000000..03ed3d8a2 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/chewmem @@ -0,0 +1,86 @@ +#!/usr/bin/perl +# Usage: +# ./chewmem.pl + +# Description: +# This will create a character array requiring "MiB" actual memory. +# Summarize high-level memory usage. 
+ +# Ideally we can demonstate creating larger and larger +# successful memory allocations until Avail is near 0. +# It is very likely to trigger OOM Killer or cause reset +# if we run completely out of memory. + +use warnings; +use strict; +use POSIX qw(strftime); + +sub show_memusage() { + our $count; + $::count++; $::count %= 15; + + my $Ki = 1024.0; + my ($MemTotal, $MemFree, $Buffers, $Cached, $CommitLimit, $Committed_AS, $Slab, $SReclaimable); + # Process all entries of MEMINFO + my $file = '/proc/meminfo'; + open(FILE, $file) || die "Cannot open file: $file ($!)"; + MEMINFO_LOOP: while($_ = ) { + s/[\0\e\f\r\a]//g; chomp; # strip control characters if any + last MEMINFO_LOOP if (/^\s*$/); # end at blank-line + if (/\bMemTotal:\s+(\d+)\s+kB/) { + $MemTotal = $1; next MEMINFO_LOOP; + } + if (/\bMemFree:\s+(\d+)\s+kB/) { + $MemFree = $1; next MEMINFO_LOOP; + } + if (/\bBuffers:\s+(\d+)\s+kB/) { + $Buffers = $1; next MEMINFO_LOOP; + } + if (/\bCached:\s+(\d+)\s+kB/) { + $Cached = $1; next MEMINFO_LOOP; + } + if (/\bCommitLimit:\s+(\d+)\s+kB/) { + $CommitLimit = $1; next MEMINFO_LOOP; + } + if (/\bCommitted_AS:\s+(\d+)\s+kB/) { + $Committed_AS = $1; next MEMINFO_LOOP; + } + if (/\bSlab:\s+(\d+)\s+kB/) { + $Slab = $1; next MEMINFO_LOOP; + } + if (/\bSReclaimable:\s+(\d+)\s+kB/) { + $SReclaimable = $1; next MEMINFO_LOOP; + } + } + close(FILE); + + my $Avail_MiB = ($MemFree + $Cached + $Buffers + $SReclaimable)/$Ki; + my $Strict_MiB = ($CommitLimit - $Committed_AS)/$Ki; + my $now = strftime "%Y-%m-%d %H:%M:%S", localtime(); + if ($::count == 1) { + printf "%19s %6s %6s %6s %6s %6s %6s %6s %6s %6s\n", + 'yyyy-mm-dd hh:mm:ss', 'Tot', 'Free', 'Ca', 'Buf', 'Slab', 'CAS', 'CLim', 'Avail', 'Strict'; + } + printf "%19s %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f\n", + $now, $MemTotal/$Ki, $MemFree/$Ki, $Cached/$Ki, $Buffers/$Ki, $Slab/$Ki, + $Committed_AS/$Ki, $CommitLimit/$Ki, $Avail_MiB, $Strict_MiB; +} + 
+#------------------------------------------------------------------------------- +# MAIN PROGRAM +# Autoflush output +select(STDERR); +$| = 1; +select(STDOUT); # default +$| = 1; + +my $MiB = $ARGV[0] ||=0.0; +my $A = "A" x (1024*1024*$MiB/2); +print "Allocating $MiB MiB character array.\n"; +while(1) { + sleep(1); + show_memusage(); +} +exit 0; + +1; diff --git a/tools/engtools/hostdata-collectors/scripts/cleanup-engtools.sh b/tools/engtools/hostdata-collectors/scripts/cleanup-engtools.sh new file mode 100644 index 000000000..e1f16239d --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/cleanup-engtools.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Purpose: +# Some of the engtools scripts are not shutting down gracefully. + +# Define common utility functions +TOOLBIN=$(dirname $0) +. ${TOOLBIN}/engtools_util.sh +if [ $UID -ne 0 ]; then + ERRLOG "Require sudo/root access." + exit 1 +fi + +declare -a TOOLS +TOOLS=() +TOOLS+=('collect-engtools.sh') +TOOLS+=('ceph.sh') +TOOLS+=('diskstats.sh') +TOOLS+=('iostat.sh') +TOOLS+=('rabbitmq.sh') +TOOLS+=('ticker.sh') +TOOLS+=('top.sh') +TOOLS+=('memstats.sh') +TOOLS+=('netstats.sh') +TOOLS+=('postgres.sh') +TOOLS+=('vswitch.sh') +TOOLS+=('filestats.sh') +TOOLS+=('live_stream.py') + +LOG "Cleanup engtools:" + +# Brute force methods (assume trouble with: service collect-engtools.sh stop) +# ( be sure not to clobber /etc/init.d/collect-engtools.sh ) +LOG "kill processes brute force" +pids=( $(pidof -x /usr/local/bin/collect-engtools.sh) ) +if [ ${#pids[@]} -ne 0 ] +then + LOG "killing: ${pids[@]}" + for pid in ${pids[@]} + do + LOG "kill: [ ${pid} ] " + pkill -KILL -P ${pid} + kill -9 ${pid} + done + pkill -KILL iostat + pkill -KILL top +else + LOG "no pids found" +fi + +LOG "remove pidfiles" +for TOOL in "${TOOLS[@]}" +do + rm -f -v /var/run/${TOOL}.pid +done +LOG "done" + +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/collect-engtools.service 
b/tools/engtools/hostdata-collectors/scripts/collect-engtools.service new file mode 100644 index 000000000..9a68b2a31 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/collect-engtools.service @@ -0,0 +1,14 @@ +[Unit] +Description=Engineering data collection tools to monitor host performance +After=network.service + +[Service] +Type=forking +ExecStart=/etc/init.d/collect-engtools.sh start +ExecStop=/etc/init.d/collect-engtools.sh stop +ExecReload=/etc/init.d/collect-engtools.sh reload +PIDFile=/var/run/collect-engtools.sh.pid +Restart=always + +[Install] +WantedBy=multi-user.target diff --git a/tools/engtools/hostdata-collectors/scripts/collect-engtools.sh b/tools/engtools/hostdata-collectors/scripts/collect-engtools.sh new file mode 100644 index 000000000..908c2b762 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/collect-engtools.sh @@ -0,0 +1,334 @@ +#!/bin/bash +# Usage: +# collect-engtools.sh [-f] [-p ] [-i ] [-c ] [-h] + +# Define common utility functions +TOOLBIN=$(dirname $0) +. 
${TOOLBIN}/engtools_util.sh + +# ENABLE DEBUG (0=disable, 1=enable) +OPT_DEBUG=0 + +# Set options for long soak (vs, shorter collection) +#OPT_SOAK=0 # long soak +OPT_SOAK=1 # few hour soak +#OPT_SOAK=2 # < hour soak + +# Define command to set nice + ionice +CMD_IDLE=$( cmd_idle_priority ) + +# Purge configuration options +# - how much data may be created per cycle +PURGE_HEADROOM_MB=100 +# - how much remaining space to leave +PURGE_HEADROOM_PERCENT=15 +# - maximum size of data collection +PURGE_MAXUSAGE_MB=1000 + +# Affine to pinned cores +AFFINE_PINNED=1 + +# Line-buffer stream output (instead of buffered) +STDBUF="stdbuf -oL" + +# Define some common durations +DUR_60MIN_IN_SEC=$[60*60] +DUR_30MIN_IN_SEC=$[30*60] +DUR_15MIN_IN_SEC=$[15*60] +DUR_10MIN_IN_SEC=$[10*60] +DUR_5MIN_IN_SEC=$[5*60] +DUR_1MIN_IN_SEC=$[1*60] + +# Global variables +declare -a parallel_outfiles +declare df_size_bytes +declare df_avail_bytes +declare du_used_bytes +declare tgt_avail_bytes +declare tgt_used_bytes + +# do_parallel_commands - launch parallel tools with separate output files +function do_parallel_commands() +{ + parallel_outfiles=() + for elem in "${tlist[@]}" + do + tool=""; period=""; repeat=""; interval="" + my_hash="elem[*]" + local ${!my_hash} + if [ ! -z "${name}" ]; then + fname="${TOOL_DEST_DIR}/${HOSTNAME}_${timestamp}_${name}" + parallel_outfiles+=( $fname ) + LOG "collecting ${tool}, ${interval} second intervals, to: ${fname}" + if [ ! -z "${period}" ]; then + ${STDBUF} ${tool} -p ${period} -i ${interval} > ${fname} 2>/dev/null & + elif [ ! -z "${repeat}" ]; then + ${STDBUF} ${tool} --repeat=${repeat} --delay=${interval} > ${fname} 2>/dev/null & + fi + else + # run without file output (eg., ticker) + ${STDBUF} ${tool} -p ${period} -i ${interval} 2>/dev/null & + fi + done +} + +# get_current_avail_usage() - get output destination file-system usage and +# availability. 
+# - updates: df_size_bytes, df_avail_bytes, du_used_bytes +function get_current_avail_usage() +{ + local -a df_arr_bytes=( $(df -P --block-size=1 ${TOOL_DEST_DIR} | awk 'NR==2 {print $2, $4}') ) + df_size_bytes=${df_arr_bytes[0]} + df_avail_bytes=${df_arr_bytes[1]} + du_used_bytes=$(du --block-size=1 ${TOOL_DEST_DIR} | awk 'NR==1 {print $1}') +} + +# purge_oldest_files() - remove oldest files based on file-system available space, +# and maximum collection size +function purge_oldest_files() +{ + # get current file-system usage + get_current_avail_usage + msg=$(printf "avail %d MB, headroom %d MB; used %d MB, max %d MB" \ + $[$df_avail_bytes/1024/1024] $[$tgt_avail_bytes/1024/1024] \ + $[$du_used_bytes/1024/1024] $[$tgt_used_bytes/1024/1024]) + LOG "usage: ${msg}" + + if [[ $df_avail_bytes -lt $tgt_avail_bytes ]] || \ + [[ $du_used_bytes -gt $tgt_used_bytes ]]; then + # wait for compression to complete + wait + + get_current_avail_usage + if [[ $df_avail_bytes -lt $tgt_avail_bytes ]]; then + msg=$(printf "purge: avail %d MB < target %d MB" \ + $[$df_avail_bytes/1024/1024] $[$tgt_avail_bytes/1024/1024] ) + LOG "purge: ${msg}" + fi + if [[ $du_used_bytes -gt $tgt_used_bytes ]]; then + msg=$(printf "purge: used %d MB > target %d MB" \ + $[$du_used_bytes/1024/1024] $[$tgt_used_bytes/1024/1024] ) + LOG "purge: ${msg}" + fi + else + return + fi + + # remove files in oldest time sorted order until we meet usage targets, + # incrementally updating usage as we remve files + for file in $( ls -rt ${TOOL_DEST_DIR}/${HOSTNAME}_* 2>/dev/null ) + do + if [[ $df_avail_bytes -ge $tgt_avail_bytes ]] && \ + [[ $du_used_bytes -le $tgt_used_bytes ]]; then + break + fi + + if [ ${OPT_DEBUG} -eq 1 ]; then + msg="purge: file=$file" + if [[ $df_avail_bytes -lt $tgt_avail_bytes ]]; then + msg="${msg}, < AVAIL" + fi + if [[ $du_used_bytes -gt $tgt_used_bytes ]]; then + msg="${msg}, > MAXUSAGE" + fi + LOG "${msg}" + fi + + sz_bytes=$(stat --printf="%s" $file) + ((df_avail_bytes += sz_bytes)) 
+ ((du_used_bytes -= sz_bytes)) + rm -fv ${file} + done +} + +#------------------------------------------------------------------------------- +# MAIN Program: +#------------------------------------------------------------------------------- +# Read configuration variable file if it is present +NAME=collect-engtools.sh +[ -r /etc/default/$NAME ] && . /etc/default/$NAME + +# Initialize tool +tools_init + +# Parse input options +tools_parse_options "${@}" + +# Set affinity of current script +CPULIST="" + +# Affine tools to NOVA pinned cores (i.e., non-cpu 0) +# - remove interference with cpu 0 +if [ "${AFFINE_PINNED}" -eq 1 ]; then + NOVA_CONF=/etc/nova/compute_extend.conf + if [ -f "${NOVA_CONF}" ]; then + source "${NOVA_CONF}" + CPULIST=${compute_pinned_cpulist} + else + CPULIST="" + fi +fi +set_affinity ${CPULIST} + +# Define output directory +if [[ "${HOSTNAME}" =~ "controller-" ]]; then + TOOL_DEST_DIR=/scratch/syseng_data/${HOSTNAME} +elif [[ "${HOSTNAME}" =~ "compute-" ]]; then + TOOL_DEST_DIR=/tmp/syseng_data/${HOSTNAME} +else + TOOL_DEST_DIR=/tmp/syseng_data/${HOSTNAME} +fi +mkdir -p ${TOOL_DEST_DIR} + +# Define daemon log output +timestamp=$( date +"%Y-%0m-%0e_%H%M" ) +DAEMON_OUT="${TOOL_DEST_DIR}/${HOSTNAME}_${timestamp}_${TOOLNAME}.log" + +# Redirect stdout and append to log if not connected to TTY +if test ! 
-t 1 ; then + exec 1>> ${DAEMON_OUT} +fi + +# Get current availability and usage +get_current_avail_usage + +# Calculate disk usage and availability purge targets +df_offset_bytes=$[$PURGE_HEADROOM_MB*1024*1024] +tgt_used_bytes=$[$PURGE_MAXUSAGE_MB*1024*1024] +((tgt_avail_bytes = df_size_bytes/100*PURGE_HEADROOM_PERCENT + df_offset_bytes)) + +# Set granularity based on duration +if [ $PERIOD_MIN -le 30 ]; then + GRAN_MIN=5 +else + GRAN_MIN=60 +fi + +# Adjust repeats and intervals based on GRAN_MIN granularity +PERIOD_MIN=$[($PERIOD_MIN+(GRAN_MIN-1))/GRAN_MIN*GRAN_MIN] +((REPEATS = PERIOD_MIN/GRAN_MIN)) +GRAN_MIN_IN_SEC=$[$GRAN_MIN*60] +if [ ${INTERVAL_SEC} -gt ${GRAN_MIN_IN_SEC} ]; then + INTERVAL_SEC=${GRAN_MIN_IN_SEC} +fi + +# Define tools and options +# [ JGAULD - need config file for customization; long soak vs specific tools ] +# [ Ideally sample < 5 second granularity, but files get big, and tool has cpu overhead ] +# [ Need < 5 second granularity to see cache pressure/flush issues ] +# [ Desire 60 sec interval for soak ] +if [ ${OPT_SOAK} -eq 1 ]; then + # Desire 60 second or greater interval for longer term data collections, + # otherwise collection files get too big. 
+ schedtop_interval=20 + occtop_interval=60 + memtop_interval=60 + netstats_interval=60 + # JGAULD: temporarily increase frequency to 1 min + postgres_interval=${DUR_1MIN_IN_SEC} + #postgres_interval=${DUR_15MIN_IN_SEC} + rabbitmq_interval=${DUR_15MIN_IN_SEC} + ceph_interval=${DUR_15MIN_IN_SEC} + diskstats_interval=${DUR_15MIN_IN_SEC} + memstats_interval=${DUR_15MIN_IN_SEC} + filestats_interval=${DUR_15MIN_IN_SEC} +elif [ ${OPT_SOAK} -eq 2 ]; then + # Assume much shorter collection (eg, < hours) + schedtop_interval=2 # i.e., 2 second interval + occtop_interval=2 # i.e., 2 second interval + memtop_interval=1 # i.e., 1 second interval + netstats_interval=30 # i.e., 30 second interval + postgres_interval=${DUR_5MIN_IN_SEC} + rabbitmq_interval=${DUR_5MIN_IN_SEC} + ceph_interval=${DUR_5MIN_IN_SEC} + diskstats_interval=${DUR_5MIN_IN_SEC} + memstats_interval=${DUR_5MIN_IN_SEC} + filestats_interval=${DUR_5MIN_IN_SEC} +else + # Assume shorter collection (eg, < a few hours) + schedtop_interval=5 # i.e., 5 second interval + occtop_interval=5 # i.e., 5 second interval + memtop_interval=5 # i.e., 5 second interval + netstats_interval=30 # i.e., 30 second interval + postgres_interval=${DUR_5MIN_IN_SEC} + rabbitmq_interval=${DUR_5MIN_IN_SEC} + ceph_interval=${DUR_5MIN_IN_SEC} + diskstats_interval=${DUR_5MIN_IN_SEC} + memstats_interval=${DUR_5MIN_IN_SEC} + filestats_interval=${DUR_5MIN_IN_SEC} +fi +schedtop_repeat=$[ $PERIOD_MIN * 60 / $schedtop_interval ] +occtop_repeat=$[ $PERIOD_MIN * 60 / $occtop_interval ] +memtop_repeat=$[ $PERIOD_MIN * 60 / $memtop_interval ] +netstats_repeat=$[ $PERIOD_MIN * 60 / $netstats_interval ] + +# Disable use of INTERVAL_SEC sample interval +OPT_USE_INTERVALS=0 + +# Define parallel engtools configuration +# - tool name, filename, and collection interval attributes +BINDIR=/usr/bin +LBINDIR=/usr/local/bin + +while IFS='' read -r line || [[ -n "$line" ]]; do + if [[ $line =~ 'ENABLE_STATIC_COLLECTION'* ]]; then + static_collection=${line:25:1} + fi 
+done < /etc/engtools/engtools.conf + +declare -a tlist +if [[ $static_collection == "Y" ]] || [[ $static_collection == "y" ]]; then + tlist+=( "tool=${LBINDIR}/top.sh name=top period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" ) + tlist+=( "tool=${LBINDIR}/iostat.sh name=iostat period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" ) + tlist+=( "tool=${LBINDIR}/netstats.sh name=netstats period=${PERIOD_MIN} interval=${netstats_interval}" ) + tlist+=( "tool=${BINDIR}/occtop name=occtop repeat=${occtop_repeat} interval=${occtop_interval}" ) + tlist+=( "tool=${BINDIR}/memtop name=memtop repeat=${memtop_repeat} interval=${memtop_interval}" ) + tlist+=( "tool=${BINDIR}/schedtop name=schedtop repeat=${schedtop_repeat} interval=${schedtop_interval}" ) + tlist+=( "tool=${LBINDIR}/diskstats.sh name=diskstats period=${PERIOD_MIN} interval=${diskstats_interval}" ) + tlist+=( "tool=${LBINDIR}/memstats.sh name=memstats period=${PERIOD_MIN} interval=${memstats_interval}" ) + tlist+=( "tool=${LBINDIR}/filestats.sh name=filestats period=${PERIOD_MIN} interval=${filestats_interval}" ) + if [[ "${HOSTNAME}" =~ "controller-" ]]; then + tlist+=( "tool=${LBINDIR}/ceph.sh name=ceph period=${PERIOD_MIN} interval=${ceph_interval}" ) + tlist+=( "tool=${LBINDIR}/postgres.sh name=postgres period=${PERIOD_MIN} interval=${postgres_interval}" ) + # tlist+=( "tool=${LBINDIR}/rabbitmq.sh name=rabbitmq period=${PERIOD_MIN} interval=${rabbitmq_interval}" ) + elif [[ "${HOSTNAME}" =~ "compute-" ]]; then + tlist+=( "tool=${LBINDIR}/vswitch.sh name=vswitch period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" ) + fi +fi + +# ticker - shows progress on the screen +tlist+=( "tool=${LBINDIR}/ticker.sh name= period=${PERIOD_MIN} interval=${DUR_1MIN_IN_SEC}" ) + + +#------------------------------------------------------------------------------- +# Main loop +#------------------------------------------------------------------------------- +OPT_DEBUG=0 +REP=0 +while [[ ${TOOL_USR1_SIGNAL} -eq 0 ]] && + [[ 
${OPT_FOREVER} -eq 1 || ${REP} -lt ${REPEATS} ]] +do + # increment loop counter + ((REP++)) + + # purge oldest files + purge_oldest_files + + # define filename timestamp + timestamp=$( date +"%Y-%0m-%0e_%H%M" ) + + # collect tools in parallel to separate output files + LOG "collecting ${TOOLNAME} at ${timestamp} for ${PERIOD_MIN} mins, repeat=${REP}" + do_parallel_commands + wait + + # Compress latest increment + LOG "compressing: ${parallel_outfiles[@]}" + ${CMD_IDLE} bzip2 -q -f ${parallel_outfiles[@]} 2>/dev/null & +done + +# wait for compression to complete +wait + +tools_cleanup 0 +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/diskstats.sh b/tools/engtools/hostdata-collectors/scripts/diskstats.sh new file mode 100644 index 000000000..376dbf185 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/diskstats.sh @@ -0,0 +1,122 @@ +#!/bin/bash +# Usage: diskstats.sh +TOOLBIN=$(dirname $0) + +# Initialize tools environment variables, and define common utility functions +. ${TOOLBIN}/engtools_util.sh +tools_init +if [ $? -ne 0 ]; then + echo "FATAL, tools_init - could not setup environment" + exit $? +fi + +# Enable use of INTERVAL_SEC sample interval +OPT_USE_INTERVALS=1 + +# Print disk summary +function print_disk() +{ + print_separator + TOOL_HIRES_TIME + + # NOTES: + # --total (grand-total) is a new option, but don't necessarily want to add tmpfs + # or dummy filesystems. 
+ # - use -H to print in SI (eg, GB, vs GiB) + # - can use -a to print all filesystems including dummy filesystems, but then + # there can be double-counting: + print_separator + cmd='df -h -H -T --local -t ext2 -t ext3 -t ext4 -t xfs --total' + ${ECHO} "Disk space usage ext2,ext3,ext4,xfs,tmpfs (SI):" + ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO} + + print_separator + cmd='df -h -H -T --local -i -t ext2 -t ext3 -t ext4 -t xfs --total' + ${ECHO} "Disk inodes usage ext2,ext3,ext4,xfs,tmpfs (SI):" + ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO} + + print_separator + cmd='drbd-overview' + ${ECHO} "drbd disk usage and status:" + ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO} + + print_separator + cmd='lvs' + ${ECHO} "logical volumes usage and status:" + ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO} + + print_separator + cmd='pvs' + ${ECHO} "physical volumes usage and status:" + ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO} + + print_separator + cmd='vgs' + ${ECHO} "volume groups usage and status:" + ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO} +} + +# Print disk static summary +function print_disk_static() +{ + print_separator + cmd='cat /proc/scsi/scsi' + ${ECHO} "Attached devices: ${cmd}" + ${cmd} + ${ECHO} + + # fdisk - requires sudo/root + print_separator + cmd='fdisk -l' + if [ $UID -eq 0 ]; then + ${ECHO} "List disk devices: ${cmd}" + ${cmd} + else + WARNLOG "Skipping cmd=${cmd}, root/sudo passwd required" + fi + ${ECHO} + + # parted - requires sudo/root + print_separator + cmd='parted -l' + if [ $UID -eq 0 ]; then + ${ECHO} "List disk devices: ${cmd}" + ${cmd} + else + WARNLOG "Skipping cmd=${cmd}, root/sudo passwd required" + fi + ${ECHO} +} + +#------------------------------------------------------------------------------- +# MAIN Program: +#------------------------------------------------------------------------------- +# Parse input options +tools_parse_options "${@}" + +# Set affinity of current script +CPULIST="" +set_affinity ${CPULIST} + +LOG "collecting ${TOOLNAME} for ${PERIOD_MIN} minutes, with 
${INTERVAL_SEC} second sample intervals." + +# Print tools generic tools header +tools_header + +# Print static disk information +print_disk_static + +# Calculate number of sample repeats based on overall interval and sampling interval +((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC)) + +for ((rep=1; rep <= REPEATS ; rep++)) +do + print_disk + sleep ${INTERVAL_SEC} +done +print_disk +LOG "done" + +# normal program exit +tools_cleanup 0 +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/engtools_util.sh b/tools/engtools/hostdata-collectors/scripts/engtools_util.sh new file mode 100644 index 000000000..311ccd2bf --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/engtools_util.sh @@ -0,0 +1,478 @@ +#!/bin/bash +TOOLNAME=$(basename $0) +PIDFILE=/var/run/${TOOLNAME}.pid +TOOL_DEBUG=1 +TOOL_EXIT_SIGNAL=0 +TOOL_USR1_SIGNAL=0 +TOOL_USR2_SIGNAL=0 +TOOL_TTY=0 +if tty 1>/dev/null ; then + TOOL_TTY=1 +fi + +# [ JGAULD : SHOULD RENAME TO TOOL_X ] +OPT_USE_INTERVALS=0 +OPT_FOREVER=0 +PERIOD_MIN=5 +INTERVAL_SEC=60 +CPULIST=0 + +# Include lsb functions +if [ -d /lib/lsb ]; then +. /lib/lsb/init-functions +else +. /etc/init.d/functions +fi +# Lightweight replacement for pidofproc -p +function check_pidfile () +{ + local pidfile pid + + OPTIND=1 + while getopts p: opt ; do + case "$opt" in + p) + pidfile="$OPTARG" + ;; + esac + done + shift $(($OPTIND - 1)) + + read pid < "${pidfile}" + if [ -n "${pid:-}" ]; then + if $(kill -0 "${pid:-}" 2> /dev/null); then + echo "$pid" + return 0 + elif ps "${pid:-}" >/dev/null 2>&1; then + echo "$pid" + return 0 # program is running, but not owned by this user + else + return 1 # program is dead and /var/run pid file exists + fi + fi +} + +# tools_init - initialize tool resources +function tools_init () +{ + local rc=0 + local error=0 + TOOLNAME=$(basename $0) + + # Check for sufficient priviledges + if [ $UID -ne 0 ]; then + ERRLOG "${NAME} requires sudo/root access." 
+ return 1 + fi + + # Check for essential binaries + ECHO=$(which echo 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ECHO=echo # use bash built-in echo + ${ECHO} "FATAL, 'echo' not found, rc=$rc"; + error=$rc + fi + DATE=$(which date 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ${ECHO} "FATAL, 'date' not found, rc=$rc"; + error=$rc + fi + + # Check for standard linux binaries, at least can use LOG functions now + # - these are used in tools_header + CAT=$(which cat 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'cat' not found, rc=$rc"; + error=$rc + fi + + ARCH=$(which arch 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'arch' not found, rc=$rc"; + error=$rc + fi + + SED=$(which sed 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'sed' not found, rc=$rc"; + error=$rc + fi + + GREP=$(which grep 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'grep' not found, rc=$rc"; + error=$rc + fi + + WC=$(which wc 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'wc' not found, rc=$rc"; + error=$rc + fi + + UNAME=$(which uname 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'uname' not found, rc=$rc"; + error=$rc + fi + + SORT=$(which sort 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'sort' not found, rc=$rc"; + error=$rc + fi + + TR=$(which tr 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'tr' not found, rc=$rc"; + error=$rc + fi + + AWK=$(which awk 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'awk' not found, rc=$rc"; + error=$rc + fi + + PKILL=$(which pkill 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'pkill' not found, rc=$rc"; + error=$rc + fi + + LS=$(which ls 2>/dev/null) + rc=$? + if [ $rc -ne 0 ]; then + ERRLOG "'ls' not found, rc=$rc"; + error=$rc + fi + + # The following block is needed for LSB systems such as Windriver Linux. + # The utility is not available on CentOS so comment it out. 
+ # Generic utility, but may not be available
+ # LSB=$(which lsb_release 2>/dev/null)
+ # rc=$?
+ # if [ $rc -ne 0 ]; then
+ # WARNLOG "'lsb_release' not found, rc=$rc";
+ # fi
+
+ # Let parent program decide what to do with the errors,
+ # give ominous warning
+ if [ $error -eq 1 ]; then
+ WARNLOG "possibly cannot continue, missing linux binaries"
+ fi
+
+ # Check if tool was previously running
+ if [ -e ${PIDFILE} ]; then
+ # [ JGAULD - remove pidofproc() / LSB compatibility issue ]
+ if check_pidfile -p "${PIDFILE}" >/dev/null; then
+ ERRLOG "${PIDFILE} exists and ${TOOLNAME} is running"
+ return 1
+ else
+ # remove pid file
+ WARNLOG "${PIDFILE} exists but ${TOOLNAME} is not running; cleaning up"
+ rm -f ${PIDFILE}
+ fi
+ fi
+
+ # Create pid file
+ echo $$ > ${PIDFILE}
+
+ # Setup trap handler - these signals trigger child shutdown and cleanup
+ trap tools_exit_handler INT HUP TERM EXIT
+ trap tools_usr1_handler USR1
+ trap tools_usr2_handler USR2
+
+ return ${rc}
+}
+
+# tools_cleanup() - terminate child processes
+function tools_cleanup() {
+ # restore signal handling to default behaviour
+ trap - INT HUP TERM EXIT
+ trap - USR1 USR2
+
+ local VERBOSE_OPT=''
+ if [ "$1" -ne "0" ]; then
+ LOG "cleanup invoked with code: $1"
+ if [ ${TOOL_DEBUG} -ne 0 ]; then
+ VERBOSE_OPT='-v'
+ fi
+ fi
+
+
+ # stop all processes launched from this process
+ pkill -TERM -P $$
+ if [ "$1" -ne "0" ]; then
+ sleep 1
+ fi
+
+ # OK, if the above didn't work, use force
+ pkill -KILL -P $$
+
+ # remove pid file
+ if [ -e ${PIDFILE} ]; then
+ rm -f ${VERBOSE_OPT} ${PIDFILE}
+ fi
+ exit $1
+}
+
+# tools_exit_handler() - exit handler routine
+function tools_exit_handler() {
+ TOOL_EXIT_SIGNAL=1
+ tools_cleanup 128
+}
+# tools_usr1_handler() - USR1 handler routine
+function tools_usr1_handler() {
+ TOOL_USR1_SIGNAL=1
+ LOG "caught USR1"
+}
+# tools_usr2_handler() - USR2 handler routine
+function tools_usr2_handler() {
+ TOOL_USR2_SIGNAL=1
+ LOG "caught USR2"
+}
+
+# LOG(), WARNLOG(),
ERRLOG() - simple print log functions (not logger) +function LOG () +{ + local tstamp_H=$( date +"%Y-%0m-%0e %H:%M:%S" ) + echo "${tstamp_H} ${HOSTNAME} $0($$): $@"; +} +function LOG_NOCR () +{ + local tstamp_H=$( date +"%Y-%0m-%0e %H:%M:%S" ) + echo -n "${tstamp_H} ${HOSTNAME} $0($$): $@"; +} +function WARNLOG () { LOG "WARN $@"; } +function ERRLOG () { LOG "ERROR $@"; } + +# TOOL_HIRES_TIME() - easily parsed date/timestamp and hi-resolution uptime +function TOOL_HIRES_TIME() +{ + echo "time: " $( ${DATE} +"%a %F %H:%M:%S.%N %Z %z" ) "uptime: " $( cat /proc/uptime ) +} + +# set_affinity() - set affinity for current script if a a CPULIST is defined +function set_affinity() { + local CPULIST=$1 + if [ -z "${CPULIST}" ]; then + return + fi + + # Set cpu affinity for current program + local TASKSET=$(which taskset 2>/dev/null) + if [ -x "${TASKSET}" ]; then + ${TASKSET} -pc ${CPULIST} $$ 2>/dev/null + fi +} + +# cmd_idle_priority() - command to set nice + ionice +function cmd_idle_priority() { + local NICE="" + local IONICE="" + + NICE=$( which nice 2>/dev/null ) + if [ $? -eq 0 ]; then + NICE="${NICE} -n 19" + else + NICE="" + fi + IONICE=$( which ionice 2>/dev/null ) + if [ $? 
-eq 0 ]; then + IONICE="${IONICE} -c 3" + else + IONICE="" + fi + echo "${NICE} ${IONICE}" +} + + +# print_separator() - print a horizontal separation line '\u002d' is '-' +function print_separator () { + printf '\u002d%.s' {1..80} + printf '\n' +} + +# tools_header() - print out common GenWare tools header +function tools_header() { + local TOOLNAME=$(basename $0) + + # Get timestamp + #local tstamp=$( date +"%Y-%0m-%0e %H:%M:%S" 2>/dev/null ) + local tstamp=$( date --rfc-3339=ns | cut -c1-23 2>/dev/null ) + + # Linux Generic + local UPTIME=/proc/uptime + + # Get number of online cpus + local CPUINFO=/proc/cpuinfo + local online_cpus=$( cat ${CPUINFO} | grep -i ^processor | wc -l 2>/dev/null ) + + # Get load average, run-queue size, and number of threads + local LOADAVG=/proc/loadavg + local LDAVG=( `cat ${LOADAVG} | sed -e 's#[/]# #g' 2>/dev/null` ) + + # Get current architecture + local arch=$( uname -m ) + + # Determine processor name (there are many different formats... *sigh* ) + # - build up info from multiple lines + local processor='unk' + local NAME=$( cat ${CPUINFO} | grep \ + -e '^cpu\W\W:' \ + -e ^'cpu model' \ + -e ^'model name' \ + -e ^'system type' \ + -e ^Processor \ + -e ^[Mm]achine | \ + sort -u | awk 'BEGIN{FS=":";} {print $2;}' | \ + tr '\n' ' ' | tr -s [:blank:] 2>/dev/null ) + if [ ! -z "${NAME}" ]; then + processor=${NAME} + fi + + # Determine processor speed (abort grep after first match) + local speed='unk' + local BOGO=$( cat ${CPUINFO} | grep -m1 -e ^BogoMIPS -e ^bogomips | \ + awk 'BEGIN{FS=":";} {printf "%.1f", $2;}' 2>/dev/null ) + local MHZ=$( cat ${CPUINFO} | grep -m1 -e ^'cpu MHz' -e ^clock | \ + awk 'BEGIN{FS=":";} {printf "%.1f", $2;}' 2>/dev/null ) + local MHZ2=$( cat ${CPUINFO} | grep -m1 -e ^Cpu0ClkTck -e ^'cycle frequency' | \ + awk 'BEGIN{FS=":";} {printf "%.1f", $2/1.0E6;}' 2>/dev/null ) + if [ ! -z "${MHZ}" ]; then + speed=${MHZ} + elif [ ! -z "${MHZ2}" ]; then + speed=${MHZ2} + elif [ ! 
-z ${BOGO} ]; then
+ speed=${BOGO}
+ fi
+
+ # Determine OS and kernel version
+ local os_name=$( uname -s 2>/dev/null )
+ local os_release=$( uname -r 2>/dev/null )
+
+ declare -a arr
+
+ local dist_id=""
+ # Determine OS distribution ID
+ if [ "$lsb_pres" == "yes" ]; then
+ arr=( $( lsb_release -i 2>/dev/null ) )
+ dist_id=${arr[2]}
+ else
+ local dist_id=$(cat /etc/centos-release | awk '{print $1}' 2>/dev/null)
+ fi
+
+ local dist_rel=""
+ if [ "$lsb_pres" == "yes" ]; then
+ # Determine OS distribution release
+ arr=( $( cat /proc/version | awk '{print $3}' 2>/dev/null ) )
+ local dist_rel=${arr[1]}
+ else
+ local dist_rel=$(cat /etc/centos-release | awk '{print $4}' 2>/dev/null)
+ fi
+ # Print generic header
+ echo "${TOOLNAME} -- ${tstamp} load average:${LDAVG[0]}, ${LDAVG[1]}, ${LDAVG[2]} runq:${LDAVG[3]} nproc:${LDAVG[4]}"
+ echo " host:${HOSTNAME} Distribution:${dist_id} ${dist_rel} ${os_name} ${os_release}"
+ echo " arch:${arch} processor:${processor} speed:${speed} MHz CPUs:${online_cpus}"
+}
+
+
+
+
+# tools_usage() - show generic tools tool usage
+function tools_usage() {
+ if [ ${OPT_USE_INTERVALS} -eq 1 ]; then
+ echo "usage: ${TOOLNAME} [-f] [-p ] [-i ] [-c ] [-h]"
+ else
+ echo "Usage: ${TOOLNAME} [-f] [-p ] [-c ] [-h]"
+ fi
+}
+
+# tools_print_help() - print generic tool help
+function tools_print_help() {
+ tools_usage
+ echo
+ echo "Options:";
+ echo " -f : collect forever : default: none"
+ echo " -p : overall collection period (minutes) : default: ${DEFAULT_PERIOD_MIN}"
+ if [ ${OPT_USE_INTERVALS} -eq 1 ]; then
+ echo " -i : sample interval (seconds) : default: ${DEFAULT_INTERVAL_SEC}"
+ fi
+ echo " -c : cpu list where tool runs (e.g., 0-1,8) : default: none"
+ echo
+ if [ ${OPT_USE_INTERVALS} -eq 1 ]; then
+ echo "Example: collect 5 minute period, sample every 30 seconds interval"
+ echo " ${TOOLNAME} -p 5 -i 30"
+ else
+ echo "Example: collect 5 minute period"
+ echo " ${TOOLNAME} -p 5"
+ fi
+}
+
+# tools_parse_options() -- parse common options for
tools scripts +function tools_parse_options() { + # check for no arguments, print usage + if [ $# -eq "0" ]; then + tools_usage + tools_cleanup 0 + exit 0 + fi + + # parse the input arguments + while getopts "fp:i:c:h" Option + do + case $Option in + f) + OPT_FOREVER=1 + PERIOD_MIN=60 + ;; + p) PERIOD_MIN=$OPTARG ;; + i) + OPT_USE_INTERVALS=1 + INTERVAL_SEC=$OPTARG + ;; + c) CPULIST=$OPTARG ;; + h) + tools_print_help + tools_cleanup 0 + exit 0 + ;; + *) + tools_usage + tools_cleanup 0 + exit 0 + ;; + esac + done + + # validate input arguments + PERIOD_MAX=$[4*24*60] + INTERVAL_MAX=$[60*60] + + error=0 + if [[ ${PERIOD_MIN} -lt 1 || ${PERIOD_MIN} -gt ${PERIOD_MAX} ]]; then + echo "-p must be > 0 and <= ${PERIOD_MAX}." + error=1 + fi + if [[ ${INTERVAL_SEC} -lt 1 || ${INTERVAL_SEC} -gt ${INTERVAL_MAX} ]]; then + echo "-i must be > 0 and <= ${INTERVAL_MAX}." + error=1 + fi + if [ ${error} -eq 1 ]; then + tools_cleanup 0 + exit 1 + fi +} diff --git a/tools/engtools/hostdata-collectors/scripts/filestats.sh b/tools/engtools/hostdata-collectors/scripts/filestats.sh new file mode 100644 index 000000000..19d38a704 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/filestats.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# Usage: filestats.sh [-p ] [-i ] [-c ] [-h] +TOOLBIN=$(dirname $0) + +# Initialize tools environment variables, and define common utility functions +. ${TOOLBIN}/engtools_util.sh +tools_init +if [ $? -ne 0 ]; then + echo "FATAL, tools_init - could not setup environment" + exit $? 
+fi + +PAGE_SIZE=$(getconf PAGE_SIZE) + +# Enable use of INTERVAL_SEC sample interval +OPT_USE_INTERVALS=1 + + +function print_files() +{ + print_separator + TOOL_HIRES_TIME + + ${ECHO} "# ls -l /proc/*/fd" + sudo ls -l /proc/*/fd 2>/dev/null | awk \ + '$11 ~ /socket/ {a += 1} ; \ + $11 ~ /null/ {b += 1} ; \ + {c += 1} \ + END {\ + {printf "%-10s %-10s %-10s %-10s\n", "TOTAL", "FILES", "SOCKETS", "NULL PIPES"} \ + {printf "%-10s %-10s %-10s %-10s\n", c, c-(a+b) , a, b}}' + + ${ECHO} + + ${ECHO} "# lsof" + printf "%-7s %-7s %-6s %-6s %-6s %-6s %-6s %-6s %-6s %-6s %-6s %-6s %s\n" "PID" "TOTAL" "FD" "U" "W" "R" "CWD" "RTD" "TXT" "MEM" "DEL" "TCP" "CMD" + sudo lsof +c 15| awk '$3 !~ /^[0-9]+/{ {pids[$2]["COMMAND"]=$1}\ + {pids[$2]["PID"]=$2}\ + {pids[$2]["TOTAL"]+=1}\ + {pids[$2]["TCP"]+=($8=="TCP")? 1 : 0}\ + {($4 ~ /^[0-9][0-9]*[urw]/ )? \ + pids[$2][substr($4, length($4),1)]+=1 : pids[$2][$4]+=1} } + END { + { for (i in pids) \ + if(pids[i]["PID"]!="PID") { + {printf "%-7s %-7s %-6s %-6s %-6s %-6s %-6s %-6s %-6s %-6s %-6s %-6s %s\n", \ + pids[i]["PID"], \ + pids[i]["TOTAL"],\ + ((pids[i]["u"]!="")? pids[i]["u"] : 0) + ((pids[i]["w"]!="")? pids[i]["w"] : 0 )+ ((pids[i]["r"]!="")? pids[i]["r"] : 0),\ + (pids[i]["u"]!="")? pids[i]["u"] : 0,\ + (pids[i]["w"]!="")? pids[i]["w"] : 0,\ + (pids[i]["r"]!="")? pids[i]["r"] : 0,\ + (pids[i]["cwd"]!="")? pids[i]["cwd"] : 0,\ + (pids[i]["rtd"]!="")? pids[i]["rtd"] : 0,\ + (pids[i]["txt"]!="")? pids[i]["txt"] : 0,\ + (pids[i]["mem"]!="")? pids[i]["mem"] : 0,\ + (pids[i]["DEL"]!="")? pids[i]["DEL"] : 0,\ + (pids[i]["TCP"]!="")? 
pids[i]["TCP"] : 0,\ + pids[i]["COMMAND"]} }}}' | sort -n -r -k3 + + ${ECHO} + + ${ECHO} "# lsof -nP +L1" + sudo lsof -nP +L1 + ${ECHO} +} + + + +#------------------------------------------------------------------------------- +# MAIN Program: +#------------------------------------------------------------------------------- +# Parse input options +tools_parse_options "${@}" + +# Set affinity of current script +CPULIST="" +set_affinity ${CPULIST} + +LOG "collecting ${TOOLNAME} for ${PERIOD_MIN} minutes, with ${INTERVAL_SEC} second sample intervals." + +# Print tools generic tools header +tools_header + +# Calculate number of sample repeats based on overall interval and sampling interval +((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC)) + +for ((rep=1; rep <= REPEATS ; rep++)) +do + print_files + sleep ${INTERVAL_SEC} +done +print_files +LOG "done" + +# normal program exit +tools_cleanup 0 +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/init.d/collect-engtools.sh b/tools/engtools/hostdata-collectors/scripts/init.d/collect-engtools.sh new file mode 100644 index 000000000..5bc7b6b2d --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/init.d/collect-engtools.sh @@ -0,0 +1,120 @@ +#!/bin/bash +### BEGIN INIT INFO +# Provides: collect-engtools +# Required-Start: $local_fs $network $syslog postgresql +# Required-Stop: $local_fs $network $syslog postgresql +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: initscript to launch engineering tools data collection daemon +# Description: initscript to launch engineering tools data collection daemon +# Blah. +### END INIT INFO + +PATH=/sbin:/usr/sbin:/bin:/usr/bin +DESC="collect engtools service" +NAME="collect-engtools.sh" +DAEMON=/usr/local/bin/${NAME} +DAEMON_ARGS="-f" +PIDFILE=/var/run/${NAME}.pid +SCRIPTNAME=/etc/init.d/${NAME} +DEFAULTFILE=/etc/default/${NAME} + +# Exit if the package is not installed +[ -x "$DAEMON" ] || exit 0 +. 
/etc/init.d/functions
+# Read configuration variable file if it is present
+[ -r $DEFAULTFILE ] && . $DEFAULTFILE
+
+# Load the VERBOSE setting and other rcS variables
+#. /lib/init/vars.sh
+
+# Define lsb fallback versions of:
+# log_daemon_msg(), log_end_msg()
+log_daemon_msg() { echo -n "${1:-}: ${2:-}"; }
+log_end_msg() { echo "."; }
+
+# Use lsb functions to perform the operations.
+if [ -f /lib/lsb/init-functions ]; then
+ . /lib/lsb/init-functions
+fi
+
+# Check for sufficient priviledges
+# [ JGAULD : possibly provide user = 'operator' option instead... ]
+if [ $UID -ne 0 ]; then
+ log_daemon_msg "Starting ${NAME} requires sudo/root access."
+ exit 1
+fi
+
+case $1 in
+ start)
+ if [ -e ${PIDFILE} ]; then
+ pid=$(pidof -x ${NAME})
+ if test "${pid}" != ""
+ then
+ echo_success "${NAME} already running"
+ exit
+ fi
+ fi
+
+
+ log_daemon_msg "Starting ${NAME}"
+ if start-stop-daemon --start --background --quiet --oknodo --pidfile ${PIDFILE} \
+ --exec ${DAEMON} -- ${DAEMON_ARGS} ; then
+ /usr/local/bin/live_stream.py &
+ log_end_msg 0
+ else
+ log_end_msg 1
+ fi
+ ;;
+
+ stop)
+ if [ -e ${PIDFILE} ]; then
+ pids=$(pidof -x ${NAME})
+ if [[ !
-z "${pids}" ]] + then + echo_success "Stopping ${NAME} [$pid]" + start-stop-daemon --stop --quiet --oknodo --pidfile ${PIDFILE} --retry=TERM/3/KILL/5 + # [ JGAULD: none of the following should be necessary ] + /usr/local/bin/cleanup-engtools.sh + else + echo_failure "${NAME} is not running" + fi + else + echo_failure "${PIDFILE} does not exist" + fi + ;; + + restart) + $0 stop && sleep 2 && $0 start + ;; + + status) + if [ -e ${PIDFILE} ]; then + pid=$(pidof -x ${NAME}) + if test "${pid}" != "" + then + echo_success "${NAME} is running" + else + echo_success "${NAME} is not running" + fi + else + echo_success "${NAME} is not running" + fi + ;; + + reload) + if [ -e ${PIDFILE} ]; then + start-stop-daemon --stop --signal USR1 --quiet --pidfile ${PIDFILE} --name ${NAME} + echo_success "${NAME} reloaded successfully" + else + echo_success "${PIDFILE} does not exist" + fi + ;; + + *) + echo "Usage: $0 {start|stop|restart|reload|status}" + exit 2 + ;; +esac + +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/iostat.sh b/tools/engtools/hostdata-collectors/scripts/iostat.sh new file mode 100644 index 000000000..04be90c35 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/iostat.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Usage: iostat.sh [-p ] [-i ] [-c ] [-h] +TOOLBIN=$(dirname $0) + +# Initialize tools environment variables, and define common utility functions +. ${TOOLBIN}/engtools_util.sh +tools_init +if [ $? -ne 0 ]; then + echo "FATAL, tools_init - could not setup environment" + exit $? +fi + +# Enable use of INTERVAL_SEC sample interval +OPT_USE_INTERVALS=1 + +IOSTAT=$( which iostat 2>/dev/null ) +if [ $? 
-ne 0 ]; then + print_separator + WARNLOG "iostat not available" + tools_cleanup 0 +fi + +# MAIN Program: +#------------------------------------------------------------------------------- +# Parse input options +tools_parse_options "${@}" + +# Set affinity of current script +CPULIST="" +set_affinity ${CPULIST} + +LOG "collecting ${TOOLNAME} for ${PERIOD_MIN} minutes, with ${INTERVAL_SEC} second sample intervals." + +# Print tools generic tools header +tools_header + +# Calculate number of sample repeats based on overall interval and sampling interval +((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC)) +((REP = REPEATS + 1)) + +# Execute tool for specified duration +CMD="${IOSTAT} -k -x -t ${INTERVAL_SEC} ${REP}" +#LOG "CMD: ${CMD}" +${CMD} +LOG "done" + +# normal program exit +tools_cleanup 0 +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/linux_benchmark.sh b/tools/engtools/hostdata-collectors/scripts/linux_benchmark.sh new file mode 100644 index 000000000..fb1d16d47 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/linux_benchmark.sh @@ -0,0 +1,547 @@ +#!/bin/bash + +username="wrsroot" +password="Li69nux*" +test_duration="30" +wait_duration="5" +udp_find_0_frameloss="1" +udp_max_iter="20" +udp_granularity="100000" +result_dir="/home/${username}/benchmark_results" +summary_file="${result_dir}/benchmark_summary.xls" +host="" +remote="" +controllers=() +computes=() +nodes=() +max_compute_node="10" +interfaces=("") +# udp header total length: Ethernet header ( 14 ) + CRC ( 4 ) + IPv4 header ( 20 ) + UDP header ( 8 ) +udp_header_len="46" +# icmp header total length: ICMP header ( 8 ) + IPv4 header ( 20 ) +icmp_header_len="28" +frame_sizes=(64 128 256 512 1024 1280 1518) +ssh_opt="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -q" +# ports used for different kind of traffics except hiprio. 
these are chosen randomly since they are not used +# 8000 - storage; 8001 - migration; 8002 - default; 8003 - drbd +controller_ports=(8000 8001 8002 8003) +compute_ports=(8000 8001 8002) +traffic_types=(storage migration default drbd) +flow_ids=(1:20 1:30 1:40 1:50) + +function exec_cmd () +{ + node="$1" + cmd="$2" + + if [[ "${node}" == *"${host}"* ]]; then + echo "$(bash -c "${cmd}")" + else + echo "$(ssh ${ssh_opt} ${username}@${node} "${cmd}")" + fi +} + +function iperf3_server_start () +{ + local server="$1" + local result="$2" + local port="$3" + local cmd="iperf3 -s" + + if [ "${port}" ]; then + cmd="${cmd} -p ${port}" + fi + cmd="nohup ${cmd} > ${result} 2>&1 &" + $(exec_cmd "${server}" "${cmd}") +} + +function iperf3_client_tcp_start () +{ + local result="${result_dir}/throughput" + local cmd="" + local client="$1" + local server="$2" + local port="$3" + + cmd="iperf3 -t ${test_duration} -c $(get_ip_addr "${server}")" + if [ "${port}" ]; then + cmd="${cmd} -p ${port} -O ${wait_duration}" + result="${result}_parallel_${port}" + else + result="${result}_tcp" + if [[ "${server}" == *"infra"* ]]; then + result="${result}_infra" + fi + fi + $(exec_cmd "${client}" "${cmd} > ${result} 2>&1") +} + +function iperf3_client_udp_start () +{ + local result="${result_dir}/throughput_udp" + local cmd="" + local client="$1" + local server="$2" + local frame_size="$3" + local bw="0" + + if [ "${4}" ]; then + bw="${4}" + fi + + cmd="iperf3 -u -t ${test_duration} -c $(get_ip_addr ${server})" + if [ ${frame_size} ]; then + cmd="${cmd} -l ${frame_size}" + result="${result}_$[${frame_size}+${udp_header_len}]" + fi + + if [[ ${server} == *"infra"* ]]; then + result="${result}_infra" + fi + + $(exec_cmd "${client}" "${cmd} -b ${bw} >> ${result} 2>&1" ) +} + +function iperf3_stop () +{ + local node="$1" + local cmd="pkill iperf3" + $(exec_cmd "${node}" "${cmd}") +} + +function get_ip_addr () +{ + arp -a | grep -oP "(?<=$1 \()[^)]*" | head -n 1 +} + +function throughput_tcp_test() 
+{ + for (( i = 0; i < ${#nodes[@]} ; i+=2 )); do + for interface in "${interfaces[@]}"; do + local interface_name="management" + local interface_suffix="" + local result_suffix="" + if [ "${interface}" == "infra" ]; then + interface_name="infrastructure" + interface_suffix="-infra" + result_suffix="_infra" + fi + local result_file="${result_dir}/throughput_tcp${result_suffix}" + printf "Running TCP throughput test between ${nodes[${i}]} and ${nodes[$[${i}+1]]}'s ${interface_name} network..." + iperf3_server_start ${nodes[$[${i}+1]]}${interface_suffix} ${result_file} + iperf3_client_tcp_start ${nodes[${i}]}${interface_suffix} ${nodes[$[${i}+1]]}${interface_suffix} + iperf3_stop ${nodes[$[${i}+1]]}${interface_suffix} + result=$(exec_cmd "${nodes[${i}]}" "awk '/sender/ {print \$7 \" \" \$8}' ${result_file}") + printf " Done (${result})\n" + done + done +} + +function throughput_udp_test () +{ + for (( i = 0; i < ${#nodes[@]} ; i+=2 )); do + for interface in "${interfaces[@]}"; do + local interface_name="management" + local interface_suffix="" + local result_suffix="" + if [ "${interface}" == "infra" ]; then + interface_name="infrastructure" + interface_suffix="-infra" + result_suffix="_infra" + fi + echo "Running UDP throughput test between ${nodes[${i}]} and ${nodes[$[${i}+1]]}'s ${interface_name} network" + for frame_size in "${frame_sizes[@]}"; do + local max_bw="0" + local min_bw="0" + local cur_bw="0" + local old_bw="0" + local result="" + local result_unit="" + local frame_loss="" + local max_result="" + local max_result_unit="" + local max_frame_loss="" + local result_file="${result_dir}/throughput_udp_${frame_size}${result_suffix}" + local iter="0" + local diff="" + printf "\tFrame size = ${frame_size}..." 
+ while true; do + iperf3_server_start ${nodes[$[${i}+1]]}${interface_suffix} ${result_file} + iperf3_client_udp_start ${nodes[${i}]}${interface_suffix} ${nodes[$[${i}+1]]}${interface_suffix} $[${frame_size}-${udp_header_len}] ${cur_bw} + iperf3_stop ${nodes[$[${i}+1]]}${interface_suffix} + result=$(exec_cmd "${nodes[${i}]}" "awk '/%/ {print \$7}' ${result_file} | tail -n1") + result_unit=$(exec_cmd "${nodes[${i}]}" "awk '/%/ {print \$8}' ${result_file} | tail -n1") + frame_loss=$(exec_cmd "${nodes[${i}]}" "awk '/%/ {print \$12}' ${result_file} | tail -n1 | tr -d '()%'") + if [ "${udp_find_0_frameloss}" == "1" ]; then + if [ "${iter}" -eq "0" ]; then + max_result="${result}" + max_result_unit="${result_unit}" + max_frame_loss="${frame_loss}" + fi + if [ $(echo ${frame_loss} | grep e) ]; then + frame_loss="$(echo ${frame_loss} | sed 's/e/*10^/g;s/ /*/' )" + fi + if [ "$(echo "${frame_loss} > 0" | bc -l)" -eq "1" ]; then + max_bw="${result}" + if [ "${result_unit}" == "Kbits/sec" ]; then + max_bw="$(echo "(${max_bw} * 1000) / 1" | bc)" + elif [ "${result_unit}" == "Mbits/sec" ]; then + max_bw="$(echo "(${max_bw} * 1000000) / 1" | bc)" + elif [ "${result_unit}" == "Gbits/sec" ]; then + max_bw="$(echo "(${max_bw} * 1000000000) / 1" | bc)" + fi + else + if [ "${iter}" -eq "0" ]; then + break + else + min_bw="${result}" + if [ "${result_unit}" == "Kbits/sec" ]; then + min_bw="$(echo "(${min_bw} * 1000) / 1" | bc)" + elif [ "${result_unit}" == "Mbits/sec" ]; then + min_bw="$(echo "(${min_bw} * 1000000) / 1" | bc)" + elif [ "${result_unit}" == "Gbits/sec" ]; then + min_bw="$(echo "(${min_bw} * 1000000000) / 1" | bc)" + fi + fi + fi + old_bw="${cur_bw}" + cur_bw="$[(${max_bw} + ${min_bw}) / 2]" + diff="$(echo "$[${cur_bw} - ${old_bw}]" | tr -d '-')" + #break + ((iter++)) + if [ "${diff}" -lt "${udp_granularity}" ]; then + break + fi + if [ "${udp_max_iter}" -ne "0" ] && [ "${iter}" -ge "${udp_max_iter}" ]; then + break + fi + else + break + fi + done + if [ 
"${udp_find_0_frameloss}" == "1" ]; then + printf " Done (%s %s @ %s%% & %s %s @ %s%%)\n" "${max_result}" "${max_result_unit}" "${max_frame_loss}" "${result}" "${result_unit}" "${frame_loss}" + else + printf " Done (%s %s @ %s%%)\n" "${result}" "${result_unit}" "${frame_loss}" + fi + done + done + done +} + +function throughput_parallel_test () +{ + local dev="" + local ip_addr="" + local interface_name="" + local interface_suffix="" + local result_file="${result_dir}/throughput_parallel" + # get device name of the interface + if [ "${#interfaces[@]}" -gt "1" ]; then + interface_name="infrastructure" + interface_suffix="-infra" + ip_addr=$(ping -c1 ${host}-infra | awk -F'[()]' '/PING/{print $2}') + else + interface_name="management" + ip_addr=$(ping -c1 ${host} | awk -F'[()]' '/PING/{print $2}') + fi + dev=$(ifconfig | grep -B1 "inet ${ip_addr}" | awk '$1!="inet" && $1!="--" {print $1}') + + + # set all the filters + for node in ${nodes[@]}; do + local ports=("${controller_ports[@]}") + if [[ "${node}" == *"compute"* ]]; then + ports=("${compute_ports[@]}") + fi + for i in $(seq 0 $[${#ports[@]} - 1]); do + if [ ${traffic_types[i]} != "default" ]; then + tc_dport="tc filter add dev ${dev} protocol ip parent 1:0 prio 1 u32 match ip protocol 6 0xff match ip dport ${ports[i]} 0xffff flowid ${flow_ids[i]}" + tc_sport="tc filter add dev ${dev} protocol ip parent 1:0 prio 1 u32 match ip protocol 6 0xff match ip sport ${ports[i]} 0xffff flowid ${flow_ids[i]}" + $(exec_cmd "${node}" "echo ${password} | sudo -S bash -c '${tc_dport}; ${tc_sport}' > /dev/null 2>&1") + fi + done + done + + # run the tests + for (( i = 0; i < ${#nodes[@]} ; i+=2 )); do + local ports=("${controller_ports[@]}") + if [[ "${nodes[${i}]}" == *"compute"* ]]; then + ports=("${compute_ports[@]}") + fi + printf "Running parallel throughput test between ${nodes[${i}]} and ${nodes[$[${i}+1]]}'s ${interface_name} network..." 
+ + # start the servers + for port in "${ports[@]}"; do + iperf3_server_start "${nodes[$[${i}+1]]}${interface_suffix}" "${result_file}_${port}" "${port}" + done + #start the clients + for port in "${controller_ports[@]}"; do + iperf3_client_tcp_start ${nodes[${i}]}${interface_suffix} ${nodes[$[${i}+1]]}${interface_suffix} ${port} & + done + sleep $[${test_duration} + ${wait_duration} + 1] + iperf3_stop ${nodes[$[${i}+1]]}${interface_suffix} + printf " Done\n" + + # get results + for j in $(seq 0 $[${#ports[@]} - 1]); do + result=$(exec_cmd "${nodes[${i}]}" "awk '/sender/ {print \$7 \" \" \$8}' ${result_file}_${ports[${j}]}") + printf "\t${traffic_types[$j]} = ${result}\n" + done + done + + # remove all the filters + for node in ${nodes[@]}; do + local handles=() + local ports=("${controller_ports[@]}") + if [[ "${node}" == *"compute"* ]]; then + ports=("${compute_ports[@]}") + fi + handles=($(exec_cmd "${node}" "/usr/sbin/tc filter show dev ${dev} | awk '/filter/ {print \$10}' | tail -n $[(${#ports[@]} - 1) * 2 ]")) + for handle in "${handles[@]}"; do + $(exec_cmd "${node}" "echo ${password} | sudo -S /usr/sbin/tc filter delete dev ${dev} parent 1: handle ${handle} prio 1 u32 > /dev/null 2>&1") + done + done +} + +function latency_test () +{ + for (( i = 0; i < ${#nodes[@]} ; i+=2 )); do + for interface in "${interfaces[@]}"; do + local interface_name="management" + local interface_suffix="" + local result_suffix="" + if [ "${interface}" == "infra" ]; then + interface_name="infrastructure" + interface_suffix="-infra" + result_suffix="_infra" + fi + echo "Running latency test between ${nodes[${i}]} and ${nodes[$[${i}+1]]}'s ${interface_name} network" + for frame_size in "${frame_sizes[@]}"; do + local result_file="${result_dir}/latency_${frame_size}${result_suffix}" + printf "\tFrame size = ${frame_size}..." 
+ $(exec_cmd "${nodes[${i}]}" "ping -s $[${frame_size}-8] -w ${test_duration} -i 0.2 ${nodes[$[${i}+1]]}${interface_suffix} > ${result_file} 2>&1") + result=$(exec_cmd "${nodes[${i}]}" "awk '/rtt/ {print \$2 \" = \" \$4 \" \" \$5}' ${result_file}") + printf " Done (%s)\n" "${result}" + done + done + done +} + +function setup () +{ + for node in ${nodes[@]}; do + iperf3_stop "${node}" + $(exec_cmd "${node}" "rm -rf ${result_dir}; mkdir -p ${result_dir}") + done +} + +function get_remote_results () +{ + for node in ${nodes[@]}; do + if [ "${node}" != "${host}" ]; then + mkdir ${result_dir}/${node} + scp ${ssh_opt} ${username}@${node}:${result_dir}/* ${result_dir}/${node} > /dev/null 2>&1 + fi + done +} + +function get_interface_info () +{ + local dev="" + local ip_addr="" + printf "Network interfaces info\n" >> ${summary_file} + for interface in "${interfaces[@]}"; do + local interface_suffix="" + local interface_name="management" + if [ "${interface}" == "infra" ]; then + interface_name="infrastructure" + interface_suffix="-infra" + fi + ip_addr=$(ping -c1 ${host}${interface_suffix} | awk -F'[()]' '/PING/{print $2}') + dev=$(ifconfig | grep -B1 "inet ${ip_addr}" | awk '$1!="inet" && $1!="--" {print $1}') + printf "%s network interface\n" "${interface_name}" >> ${summary_file} + echo ${password} | sudo -S ethtool ${dev} >> ${summary_file} + done +} + +function generate_summary () +{ + local header="" + local result="" + local result_file="" + + printf "Summary\n\n" > ${summary_file} + printf "Throughput TCP\n" >> ${summary_file} + for (( i = 0; i < ${#nodes[@]} ; i+=2 )); do + for interface in "${interfaces[@]}"; do + local node_type="controller" + local interface_type="mgmt" + local result_suffix="" + if [[ "${nodes[${i}]}" == *"compute"* ]]; then + node_type="compute" + fi + if [ "${interface}" == "infra" ]; then + interface_type="infra" + result_suffix="_infra" + fi + header="${header},${node_type}'s ${interface_type}" + result_file="${result_dir}" + if [ 
${node_type} == "compute" ]; then + result_file="${result_file}/${nodes[${i}]}" + fi + result_file="${result_file}/throughput_tcp${result_suffix}" + result="${result},$(awk '/sender/ {print $7 " " $8}' ${result_file})" + done + done + printf "%s\n%s\n\n" "${header}" "${result}" >> ${summary_file} + + printf "Throughput UDP\n" >> ${summary_file} + header=",frame,max throughput,max frameloss" + if [ "${udp_find_0_frameloss}" == "1" ]; then + header="${header},final throughput, final frameloss" + fi + for (( i = 0; i < ${#nodes[@]} ; i+=2 )); do + for interface in "${interfaces[@]}"; do + local node_type="controller" + local interface_type="mgmt" + local result_suffix="" + if [[ "${nodes[${i}]}" == *"compute"* ]]; then + node_type="compute" + fi + if [ "${interface}" == "infra" ]; then + interface_type="infra" + result_suffix="_infra" + fi + printf "%s's %s\n%s\n" "${node_type}" "${interface_type}" "${header}" >> ${summary_file} + result_file=${result_dir} + if [ ${node_type} == "compute" ]; then + result_file="${result_file}/${nodes[${i}]}" + fi + for frame in ${frame_sizes[@]}; do + result="${frame},$(awk '/%/ {print $7 " " $8}' ${result_file}/throughput_udp_${frame}${result_suffix} | head -n1),$(awk '/%/ {print $12}' ${result_file}/throughput_udp_${frame}${result_suffix} | head -n1 | tr -d '()')" + if [ "${udp_find_0_frameloss}" == "1" ]; then + result="${result},$(awk '/%/ {print $7 " " $8}' ${result_file}/throughput_udp_${frame}${result_suffix} | tail -n1),$(awk '/%/ {print $12}' ${result_file}/throughput_udp_${frame}${result_suffix} | tail -n1 | tr -d '()')" + fi + printf ",%s\n" "${result}" >> ${summary_file} + done + printf "\n" >> ${summary_file} + done + done + + printf "Parallel throughput result\n" >> ${summary_file} + header=",Node type" + for traffic_type in "${traffic_types[@]}"; do + header="${header},${traffic_type}" + done + printf "%s\n" "${header}" >> ${summary_file} + for (( i = 0; i < ${#nodes[@]} ; i+=2 )); do + local node_type="controller" + 
local ports=("${controller_ports[@]}") + if [[ "${nodes[${i}]}" == *"compute"* ]]; then + node_type="compute" + fi + result_file=${result_dir} + if [ ${node_type} == "compute" ]; then + ports=("${compute_ports[@]}") + result_file="${result_file}/${nodes[${i}]}" + fi + result=",${node_type}" + for port in "${ports[@]}"; do + result="${result},$(awk '/sender/ {print $7 " " $8}' ${result_file}/throughput_parallel_${port})" + done + printf "%s\n" "${result}" >> ${summary_file} + done + + printf "\nLatency result in ms\n" >> ${summary_file} + for (( i = 0; i < ${#nodes[@]} ; i+=2 )); do + for interface in "${interfaces[@]}"; do + local node_type="controller" + local interface_type="mgmt" + local result_suffix="" + if [[ "${nodes[${i}]}" == *"compute"* ]]; then + node_type="compute" + fi + if [ "${interface}" == "infra" ]; then + interface_type="infra" + result_suffix="_infra" + fi + printf "%s's %s network\n" "${node_type}" "${interface_type}" >> ${summary_file} + result_file=${result_dir} + if [ ${node_type} == "compute" ]; then + result_file="${result_file}/${nodes[${i}]}" + fi + result_file="${result_file}/latency" + printf ",frame size,%s\n" "$(awk '/rtt/ {print $2}' ${result_file}_${frame_sizes}${result_suffix} | tr '/' ',' )" >> ${summary_file} + for frame_size in "${frame_sizes[@]}"; do + printf ",%s,%s\n" "${frame_size}" "$(awk '/rtt/ {print $4}' ${result_file}_${frame_size}${result_suffix} | tr '/' ',' )" >> ${summary_file} + done + + printf "latency distribution\n" >> ${summary_file} + printf ",frame size" >> ${summary_file} + for (( j = 1; j < "20" ; j+=1 )); do + printf ",%s" "$(echo "scale=3;${j}/100" | bc | awk '{printf "%.3f", $0}')" >> ${summary_file} + done + printf "\n" >> ${summary_file} + for frame_size in "${frame_sizes[@]}"; do + printf ",%s" "${frame_size}" >> ${summary_file} + for (( j = 1; j < "20" ; j+=1 )); do + printf ",%s" "$(grep -c "time=$(echo "scale=2;${j}/100" | bc | awk '{printf "%.2f", $0}')" 
${result_file}_${frame_size}${result_suffix})" >> ${summary_file} + done + printf "\n" >> ${summary_file} + done + printf "\n" >> ${summary_file} + done + done + + get_interface_info +} + +echo "Starting linux interface benchmark test. ($(date))" + +# find the nodes to test +host=${HOSTNAME} +if [ "${host}" == "controller-1" ]; then + remote="controller-0" +else + remote="controller-1" +fi + +# at least another controller needs to be reachable +ping -c1 ${remote} > /dev/null 2>&1 +if [ $? -eq 0 ]; then + controllers=(${host} ${remote}) + nodes+=("${controllers[@]}") +else + echo "Stopping test as ${remote} is not reachable" + exit 1 +fi + +# check if infrastructure interface is provisioned +ping -c1 "${remote}-infra" > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "Infrastructure network is provisioned" + interfaces+=("infra") +fi + +# check if there are any compute nodes +for i in $(seq 0 $[${max_compute_node} - 1]); do + ping -c1 compute-${i} > /dev/null 2>&1 + if [ $? -eq 0 ]; then + computes+=("compute-${i}") + if [ ${#computes[@]} -ge "2" ]; then + nodes+=("${computes[@]}") + break + fi + fi +done + +setup +throughput_tcp_test +throughput_udp_test +throughput_parallel_test +latency_test +get_remote_results +generate_summary +echo "Linux interface benchmark test finished. ($(date))" + diff --git a/tools/engtools/hostdata-collectors/scripts/live_stream.py b/tools/engtools/hostdata-collectors/scripts/live_stream.py new file mode 100644 index 000000000..8192048d7 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/live_stream.py @@ -0,0 +1,1578 @@ +#!/usr/bin/python + +""" +Copyright (c) 2017 Wind River Systems, Inc. 
+ +SPDX-License-Identifier: Apache-2.0 +""" + +import os +import sys +import time +import datetime +import psutil +import fcntl +import logging +import ConfigParser +from multiprocessing import Process, cpu_count +from subprocess import Popen, PIPE +from collections import OrderedDict + + +# generates the required string for the areas where fields are not static +def generateString(meas, tag_n, tag_v, field_n, field_v): + base = "{},".format(meas) + try: + for i in range(len(tag_n)): + if i == len(tag_n) - 1: + # have space between tags and fields + base += "'{}'='{}' ".format(tag_n[i], str(tag_v[i])) + else: + # separate with commas + base += "'{}'='{}',".format(tag_n[i], str(tag_v[i])) + for i in range(len(field_v)): + if str(field_v[i]).replace(".", "").isdigit(): + if i == len(field_v) - 1: + base += "'{}'='{}'".format(field_n[i], str(field_v[i])) + else: + base += "'{}'='{}',".format(field_n[i], str(field_v[i])) + return base + except IndexError: + return None + + +# collects system memory information +def collectMemtop(influx_info, node, ci): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("memtop data starting collection with a collection interval of {}s".format(ci["memtop"])) + measurement = "memtop" + tags = {"node": node} + MiB = 1024.0 + while True: + try: + fields = OrderedDict([("total", 0), ("used", 0), ("free", 0), ("cached", 0), ("buf", 0), ("slab", 0), ("cas", 0), ("clim", 0), ("dirty", 0), ("wback", 0), ("anon", 0), ("avail", 0)]) + with open("/proc/meminfo", "r") as f: + hps = 0 + # for each line in /proc/meminfo, match with element in fields + for line in f: + line = line.strip("\n").split() + if line[0].strip(":").startswith("MemTotal"): + # convert to from kibibytes to mibibytes + fields["total"] = float(line[1]) / MiB + elif line[0].strip(":").startswith("MemFree"): + fields["free"] = int(line[1]) / MiB + elif 
line[0].strip(":").startswith("MemAvailable"): + fields["avail"] = float(line[1]) / MiB + elif line[0].strip(":").startswith("Buffers"): + fields["buf"] = float(line[1]) / MiB + elif line[0].strip(":").startswith("Cached"): + fields["cached"] = float(line[1]) / MiB + elif line[0].strip(":").startswith("Slab"): + fields["slab"] = float(line[1]) / MiB + elif line[0].strip(":").startswith("CommitLimit"): + fields["clim"] = float(line[1]) / MiB + elif line[0].strip(":").startswith("Committed_AS"): + fields["cas"] = float(line[1]) / MiB + elif line[0].strip(":").startswith("Dirty"): + fields["dirty"] = float(line[1]) / MiB + elif line[0].strip(":").startswith("Writeback"): + fields["wback"] = float(line[1]) / MiB + elif line[0].strip(":").endswith("(anon)"): + fields["anon"] += float(line[1]) / MiB + elif line[0].strip(":").endswith("Hugepagesize"): + hps = float(line[1]) / MiB + fields["used"] = fields["total"] - fields["avail"] + f.close() + # get platform specific memory info + fields["platform_avail"] = 0 + fields["platform_hfree"] = 0 + for file in os.listdir("/sys/devices/system/node"): + if file.startswith("node"): + node_num = file.replace("node", "").strip("\n") + avail = hfree = 0 + with open("/sys/devices/system/node/{}/meminfo".format(file)) as f1: + for line in f1: + line = line.strip("\n").split() + if line[2].strip(":").startswith("MemFree") or line[2].strip(":").startswith("FilePages") or line[2].strip(":").startswith("SReclaimable"): + avail += float(line[3]) + elif line[2].strip(":").startswith("HugePages_Free"): + hfree = float(line[3]) * hps + fields["{}:avail".format(node_num)] = avail / MiB + fields["{}:hfree".format(node_num)] = hfree + # get platform sum + fields["platform_avail"] += avail / MiB + fields["platform_hfree"] += hfree + f1.close() + s = generateString(measurement, tags.keys(), tags.values(), fields.keys(), fields.values()) + if s is None: + good_string = False + else: + good_string = True + if good_string: + # send data to InfluxDB + 
p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], s), shell=True) + p.communicate() + time.sleep(ci["memtop"]) + except KeyboardInterrupt: + break + except Exception: + logging.error("memtop collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects rss and vsz information +def collectMemstats(influx_info, node, ci, services, syseng_services, openstack_services, exclude_list, skip_list, collect_all): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("memstats data starting collection with a collection interval of {}s".format(ci["memstats"])) + measurement = "memstats" + tags = {"node": node} + ps_output = None + influx_string = "" + while True: + try: + fields = {} + ps_output = Popen("exec ps -e -o rss,vsz,cmd", shell=True, stdout=PIPE) + # create dictionary of dictionaries + if collect_all is False: + for svc in services: + fields[svc] = {"rss": 0, "vsz": 0} + fields["static_syseng"] = {"rss": 0, "vsz": 0} + fields["live_syseng"] = {"rss": 0, "vsz": 0} + fields["total"] = {"rss": 0, "vsz": 0} + ps_output.stdout.readline() + while True: + # for each line in ps output, get rss and vsz info + line = ps_output.stdout.readline().strip("\n").split() + # if at end of output, send data + if not line: + break + else: + rss = float(line[0]) + vsz = float(line[1]) + # go through all command outputs + for i in range(2, len(line)): + # remove unwanted characters and borders from cmd name. 
Ex: /usr/bin/example.py -> example.py + svc = line[i].replace("(", "").replace(")", "").strip(":").split("/")[-1].strip("\n") + if svc == "gunicorn": + gsvc = line[-1].replace("[", "").replace("]", "").strip("\n") + if gsvc == "public:application": + gsvc = "keystone-public" + elif gsvc == "admin:application": + gsvc = "keystone-admin" + gsvc = "gunicorn_{}".format(gsvc) + if gsvc not in fields: + fields[gsvc] = {"rss": rss, "vsz": vsz} + else: + fields[gsvc]["rss"] += rss + fields[gsvc]["vsz"] += vsz + + elif svc == "postgres": + if line[i + 1].startswith("-") is False and line[i + 1].startswith("_") is False and line[i + 1] != "psql": + psvc = "" + if line[i + 2] in openstack_services: + psvc = line[i + 2].strip("\n") + else: + for j in range(i + 1, len(line)): + psvc += "{}_".format(line[j].strip("\n")) + psvc = "postgres_{}".format(psvc).strip("_") + if psvc not in fields: + fields[psvc] = {"rss": rss, "vsz": vsz} + else: + fields[psvc]["rss"] += rss + fields[psvc]["vsz"] += vsz + + if collect_all is False: + if svc in services: + fields[svc]["rss"] += rss + fields[svc]["vsz"] += vsz + fields["total"]["rss"] += rss + fields["total"]["vsz"] += vsz + break + elif svc in syseng_services: + if svc == "live_stream.py": + fields["live_syseng"]["rss"] += rss + fields["live_syseng"]["vsz"] += vsz + else: + fields["static_syseng"]["rss"] += rss + fields["static_syseng"]["vsz"] += vsz + fields["total"]["rss"] += rss + fields["total"]["vsz"] += vsz + break + # Collect all services + else: + if svc in exclude_list or svc.startswith("-") or svc[0].isdigit() or svc.startswith("[") or svc.endswith("]"): + continue + elif svc in skip_list or svc.startswith("IPaddr"): + break + else: + if svc not in fields: + fields[svc] = {"rss": rss, "vsz": vsz} + else: + fields[svc]["rss"] += rss + fields[svc]["vsz"] += vsz + fields["total"]["rss"] += rss + fields["total"]["vsz"] += vsz + break + # send data to InfluxDB + for key in fields.keys(): + influx_string += "{},'{}'='{}','{}'='{}' 
'{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", key, "rss", fields[key]["rss"], "vsz", fields[key]["vsz"]) + "\n" + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + influx_string = "" + ps_output.kill() + time.sleep(ci["memstats"]) + except KeyboardInterrupt: + if ps_output is not None: + ps_output.kill() + break + except Exception: + logging.error("memstats collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects task cpu information +def collectSchedtop(influx_info, node, ci, services, syseng_services, openstack_services, exclude_list, skip_list, collect_all): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("schedtop data starting collection with a collection interval of {}s".format(ci["schedtop"])) + measurement = "schedtop" + tags = {"node": node} + influx_string = "" + top_output = Popen("exec top -b -c -w 512 -d{}".format(ci["schedtop"]), shell=True, stdout=PIPE) + while True: + try: + fields = {} + pro = psutil.Process(top_output.pid) + # if process dies, restart it + if pro.status() == "zombie": + top_output.kill() + top_output = Popen("exec top -b -c -w 512 -d{}".format(ci["schedtop"]), shell=True, stdout=PIPE) + if collect_all is False: + for svc in services: + fields[svc] = 0 + fields["static_syseng"] = 0 + fields["live_syseng"] = 0 + fields["total"] = 0 + # check first line + line = top_output.stdout.readline() + if not line: + pass + else: + # skip header completely + for _ in range(6): + top_output.stdout.readline() + while True: + line = top_output.stdout.readline().strip("\n").split() + # if end of top output, leave this while loop + if not line: + break + else: + occ = float(line[8]) + # for each command listed, 
check if it matches one from the list + for i in range(11, len(line)): + # remove unwanted characters and borders from cmd name. Ex: /usr/bin/example.py -> example.py + svc = line[i].replace("(", "").replace(")", "").strip(":").split("/")[-1] + if svc == "gunicorn": + gsvc = line[-1].replace("[", "").replace("]", "").strip("\n") + if gsvc == "public:application": + gsvc = "keystone-public" + elif gsvc == "admin:application": + gsvc = "keystone-admin" + gsvc = "gunicorn_{}".format(gsvc) + if gsvc not in fields: + fields[gsvc] = occ + else: + fields[gsvc] += occ + + elif svc == "postgres": + if line[i + 1].startswith("-") is False and line[i + 1].startswith("_") is False and line[i + 1] != "psql": + psvc = "" + if line[i + 2] in openstack_services: + psvc = line[i + 2].strip("\n") + else: + for j in range(i + 1, len(line)): + psvc += "{}_".format(line[j].strip("\n")) + psvc = "postgres_{}".format(psvc).strip("_") + if psvc not in fields: + fields[psvc] = occ + else: + fields[psvc] += occ + + if collect_all is False: + if svc in services: + fields[svc] += occ + fields["total"] += occ + break + elif svc in syseng_services: + if svc == "live_stream.py": + fields["live_syseng"] += occ + else: + fields["static_syseng"] += occ + fields["total"] += occ + break + # Collect all services + else: + if svc in exclude_list or svc.startswith("-") or svc[0].isdigit() or svc.startswith("[") or svc.endswith("]"): + continue + elif svc in skip_list or svc.startswith("IPaddr"): + break + else: + if svc not in fields: + fields[svc] = occ + else: + fields[svc] += occ + fields["total"] += occ + break + for key in fields.keys(): + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}'".format(measurement, "node", tags["node"], "service", key, "occ", fields[key]) + "\n" + # send data to InfluxDB + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + 
influx_string = "" + time.sleep(ci["schedtop"]) + except KeyboardInterrupt: + if top_output is not None: + top_output.kill() + break + except Exception: + logging.error("schedtop collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects disk utilization information +def collectDiskstats(influx_info, node, ci): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("diskstats data starting collection with a collection interval of {}s".format(ci["diskstats"])) + measurement = "diskstats" + tags = {"node": node, "file_system": None, "type": None, "mount": None} + fields = {"size": 0, "used": 0, "avail": 0, "usage": 0} + influx_string = "" + while True: + try: + parts = psutil.disk_partitions() + for i in parts: + # gather all partitions + tags["mount"] = str(i[1]).split("/")[-1] + # if mount == '', call it root + if tags["mount"] == "": + tags["mount"] = "root" + # skip boot + elif tags["mount"] == "boot": + continue + tags["file_system"] = str(i[0]).split("/")[-1] + tags["type"] = i[2] + u = psutil.disk_usage(i[1]) + fields["size"] = u[0] + fields["used"] = u[1] + fields["avail"] = u[2] + fields["usage"] = u[3] + influx_string += "{},'{}'='{}','{}'='{}','{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "file_system", tags["file_system"], "type", tags["type"], "mount", tags["mount"], "size", fields["size"], "used", fields["used"], "avail", fields["avail"], "usage", fields["usage"]) + "\n" + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + influx_string = "" + time.sleep(ci["diskstats"]) + except KeyboardInterrupt: + break + except Exception: + logging.error("diskstats collection stopped unexpectedly with error: {}. 
Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collect device I/O information +def collectIostat(influx_info, node, ci): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("iostat data starting collection with a collection interval of {}s".format(ci["iostat"])) + measurement = "iostat" + tags = {"node": node} + sector_size = 512.0 + influx_string = "" + while True: + try: + fields = {} + tmp = {} + tmp1 = {} + start = time.time() + # get initial values + for dev in os.listdir("/sys/block/"): + if dev.startswith("sr"): + continue + else: + fields[dev] = {"r/s": 0, "w/s": 0, "io/s": 0, "rkB/s": 0, "wkB/s": 0, "rrqms/s": 0, "wrqms/s": 0, "util": 0} + tmp[dev] = {"init_reads": 0, "init_reads_merged": 0, "init_read_sectors": 0, "init_read_wait": 0, "init_writes": 0, "init_writes_merged": 0, "init_write_sectors": 0, "init_write_wait": 0, "init_io_progress": 0, "init_io_time": 0, "init_wait_time": 0} + with open("/sys/block/{}/stat".format(dev), "r") as f: + # get initial readings + line = f.readline().strip("\n").split() + tmp[dev]["init_reads"] = int(line[0]) + tmp[dev]["init_reads_merged"] = int(line[1]) + tmp[dev]["init_read_sectors"] = int(line[2]) + tmp[dev]["init_read_wait"] = int(line[3]) + tmp[dev]["init_writes"] = int(line[4]) + tmp[dev]["init_writes_merged"] = int(line[5]) + tmp[dev]["init_write_sectors"] = int(line[6]) + tmp[dev]["init_write_wait"] = int(line[7]) + tmp[dev]["init_io_progress"] = int(line[8]) + tmp[dev]["init_io_time"] = int(line[9]) + tmp[dev]["init_wait_time"] = int(line[10]) + time.sleep(ci["iostat"]) + dt = time.time() - start + # get values again + for dev in os.listdir("/sys/block/"): + if dev.startswith("sr"): + continue + else: + # during a swact, some devices may not have been read in the initial reading. 
If found now, add them to dict + if dev not in fields: + fields[dev] = {"r/s": 0, "w/s": 0, "io/s": 0, "rkB/s": 0, "wkB/s": 0, "rrqms/s": 0, "wrqms/s": 0, "util": 0} + tmp1[dev] = {"reads": 0, "reads_merged": 0, "read_sectors": 0, "read_wait": 0, "writes": 0, "writes_merged": 0, "write_sectors": 0, "write_wait": 0, "io_progress": 0, "io_time": 0, "wait_time": 0} + with open("/sys/block/{}/stat".format(dev), "r") as f: + line = f.readline().strip("\n").split() + tmp1[dev]["reads"] = int(line[0]) + tmp1[dev]["reads_merged"] = int(line[1]) + tmp1[dev]["read_sectors"] = int(line[2]) + tmp1[dev]["read_wait"] = int(line[3]) + tmp1[dev]["writes"] = int(line[4]) + tmp1[dev]["writes_merged"] = int(line[5]) + tmp1[dev]["write_sectors"] = int(line[6]) + tmp1[dev]["write_wait"] = int(line[7]) + tmp1[dev]["io_progress"] = int(line[8]) + tmp1[dev]["io_time"] = int(line[9]) + tmp1[dev]["wait_time"] = int(line[10]) + # take difference and divide by delta t + for key in fields: + # if device was found in initial and second reading, do calculation + if key in tmp and key in tmp1: + fields[key]["r/s"] = abs(tmp1[key]["reads"] - tmp[key]["init_reads"]) / dt + fields[key]["w/s"] = abs(tmp1[key]["writes"] - tmp[key]["init_writes"]) / dt + fields[key]["rkB/s"] = abs(tmp1[key]["read_sectors"] - tmp[key]["init_read_sectors"]) * sector_size / dt / 1000 + fields[key]["wkB/s"] = abs(tmp1[key]["write_sectors"] - tmp[key]["init_write_sectors"]) * sector_size / dt / 1000 + fields[key]["rrqms/s"] = abs(tmp1[key]["reads_merged"] - tmp[key]["init_reads_merged"]) / dt + fields[key]["wrqms/s"] = abs(tmp1[key]["writes_merged"] - tmp[key]["init_writes_merged"]) / dt + fields[key]["io/s"] = fields[key]["r/s"] + fields[key]["w/s"] + fields[key]["rrqms/s"] + fields[key]["wrqms/s"] + fields[key]["util"] = abs(tmp1[key]["io_time"] - tmp[key]["init_io_time"]) / dt / 10 + influx_string += "{},'{}'='{}','{}'='{}' 
'{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "device", key, "r/s", fields[key]["r/s"], "w/s", fields[key]["w/s"], "rkB/s", fields[key]["rkB/s"], "wkB/s", fields[key]["wkB/s"], "rrqms/s", fields[key]["rrqms/s"], "wrqms/s", fields[key]["wrqms/s"], "io/s", fields[key]["io/s"], "util", fields[key]["util"]) + "\n" + # send data to InfluxDB + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + influx_string = "" + except KeyboardInterrupt: + break + except Exception: + logging.error("iostat collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects cpu load average information +def collectLoadavg(influx_info, node, ci): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("load_avg data starting collection with a collection interval of {}s".format(ci["load_avg"])) + measurement = "load_avg" + tags = {"node": node} + fields = {"load_avg": 0} + while True: + try: + fields["load_avg"] = os.getloadavg()[0] + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{},'{}'='{}' '{}'='{}''".format(influx_info[0], influx_info[1], influx_info[2], measurement, "node", tags["node"], "load_avg", fields["load_avg"]), shell=True) + p.communicate() + time.sleep(ci["load_avg"]) + except KeyboardInterrupt: + break + except Exception: + logging.error("load_avg collection stopped unexpectedly with error: {}. 
Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects cpu utilization information +def collectOcctop(influx_info, node, ci, pc): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("occtop data starting collection with a collection interval of {}s".format(ci["occtop"])) + measurement = "occtop" + tags = {"node": node} + platform_cores = pc + influx_string = "" + while True: + try: + cpu = psutil.cpu_percent(percpu=True) + cpu_times = psutil.cpu_times_percent(percpu=True) + fields = {} + # sum all cpu percents + total = float(sum(cpu)) + sys_total = 0 + fields["platform_total"] = {"usage": 0, "system": 0} + cores = 0 + # for each core, get values and assign a tag + for el in cpu: + fields["usage"] = float(el) + fields["system"] = float(cpu_times[cores][2]) + sys_total += float(cpu_times[cores][2]) + tags["core"] = "core_{}".format(cores) + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "core", tags["core"], "usage", fields["usage"], "system", fields["system"]) + "\n" + if len(platform_cores) > 0: + if cores in platform_cores: + fields["platform_total"]["usage"] += float(el) + fields["platform_total"]["system"] += float(cpu_times[cores][2]) + cores += 1 + # add usage and system total to influx string + if len(platform_cores) > 0: + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "core", "platform_total", "usage", fields["platform_total"]["usage"], "system", fields["platform_total"]["system"]) + "\n" + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "core", "total", "usage", total, "system", sys_total) + "\n" + # send data to Influx + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], 
influx_string), shell=True) + p.communicate() + influx_string = "" + time.sleep(ci["occtop"]) + except KeyboardInterrupt: + break + except Exception: + logging.error("occtop collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects network interface information +def collectNetstats(influx_info, node, ci): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("netstats data starting collection with a collection interval of {}s".format(ci["netstats"])) + measurement = "netstats" + tags = {"node": node} + fields = {} + prev_fields = {} + Mbps = float(1000000 / 8) + influx_string = "" + while True: + try: + net = psutil.net_io_counters(pernic=True) + # get initial data for difference calculation + for key in net: + prev_fields[key] = {"tx_B": net[key][0], "rx_B": net[key][1], "tx_p": net[key][2], "rx_p": net[key][3]} + start = time.time() + time.sleep(ci["netstats"]) + net = psutil.net_io_counters(pernic=True) + # get new data for difference calculation + dt = time.time() - start + for key in net: + tx_B = (float(net[key][0]) - float(prev_fields[key]["tx_B"])) + tx_Mbps = tx_B / Mbps / dt + rx_B = (float(net[key][1]) - float(prev_fields[key]["rx_B"])) + rx_Mbps = rx_B / Mbps / dt + tx_pps = (float(net[key][2]) - float(prev_fields[key]["tx_p"])) / dt + rx_pps = (float(net[key][3]) - float(prev_fields[key]["rx_p"])) / dt + # ensure no division by zero + if rx_B > 0 and rx_pps > 0: + rx_packet_size = rx_B / rx_pps + else: + rx_packet_size = 0 + if tx_B > 0 and tx_pps > 0: + tx_packet_size = tx_B / tx_pps + else: + tx_packet_size = 0 + fields[key] = {"tx_mbps": tx_Mbps, "rx_mbps": rx_Mbps, "tx_pps": tx_pps, "rx_pps": rx_pps, "tx_packet_size": tx_packet_size, "rx_packet_size": rx_packet_size} + for key in fields: + influx_string += "{},'{}'='{}','{}'='{}' 
'{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "interface", key, "rx_mbps", fields[key]["rx_mbps"], "tx_mbps", fields[key]["tx_mbps"], "rx_pps", fields[key]["rx_pps"], "tx_pps", fields[key]["tx_pps"], "rx_packet_size", fields[key]["rx_packet_size"], "tx_packet_size", fields[key]["tx_packet_size"]) + "\n" + # send data to InfluxDB + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + influx_string = "" + except KeyboardInterrupt: + break + except Exception: + logging.error("netstats collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects postgres db size and postgres service size information +def collectPostgres(influx_info, node, ci): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("postgres data starting collection with a collection interval of {}s".format(ci["postgres"])) + measurement = "postgres_db_size" + measurement1 = "postgres_svc_stats" + tags = {"node": node, "service": None, "table_schema": 0, "table": None} + fields = {"db_size": 0, "connections": 0} + fields1 = {"table_size": 0, "total_size": 0, "index_size": 0, "live_tuples": 0, "dead_tuples": 0} + postgres_output = postgres_output1 = None + influx_string = influx_string1 = "" + good_string = False + while True: + try: + # make sure this is active controller, otherwise postgres queries wont work + if isActiveController(): + while True: + # get list of databases and their sizes + postgres_output = Popen("sudo -u postgres psql --pset pager=off -q -t -c'SELECT datname, pg_database_size(datname) FROM pg_database WHERE datistemplate = false;'", shell=True, stdout=PIPE) + lines = postgres_output.stdout.read().replace(" ", "").strip().split("\n") + 
if lines == "" or lines is None: + postgres_output.kill() + break + else: + # for each database from the previous output + for line in lines: + if not line: + break + line = line.replace(" ", "").split("|") + tags["service"] = line[0] + fields["db_size"] = line[1] + # send DB size to InfluxDB + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}'".format(measurement, "node", tags["node"], "service", tags["service"], "db_size", fields["db_size"]) + "\n" + # get tables for each database + sql = "SELECT table_schema,table_name,pg_size_pretty(table_size) AS table_size,pg_size_pretty(indexes_size) AS indexes_size,pg_size_pretty(total_size) AS total_size,live_tuples,dead_tuples FROM (SELECT table_schema,table_name,pg_table_size(table_name) AS table_size,pg_indexes_size(table_name) AS indexes_size,pg_total_relation_size(table_name) AS total_size,pg_stat_get_live_tuples(table_name::regclass) AS live_tuples,pg_stat_get_dead_tuples(table_name::regclass) AS dead_tuples FROM (SELECT table_schema,table_name FROM information_schema.tables WHERE table_schema='public' AND table_type='BASE TABLE') AS all_tables ORDER BY total_size DESC) AS pretty_sizes;" + postgres_output1 = Popen('sudo -u postgres psql --pset pager=off -q -t -d{} -c"{}"'.format(line[0], sql), shell=True, stdout=PIPE) + lines = postgres_output1.stdout.read().replace(" ", "").strip().split("\n") + for line in lines: + if line == "": + continue + else: + line = line.replace(" ", "").split("|") + elements = list() + # ensures all data is present + if len(line) != 7: + good_string = False + break + else: + # do some conversions + for el in line: + if el.endswith("bytes"): + el = int(el.replace("bytes", "")) + elif el.endswith("kB"): + el = el.replace("kB", "") + el = int(el) * 1000 + elif el.endswith("MB"): + el = el.replace("MB", "") + el = int(el) * 1000000 + elif el.endswith("GB"): + el = el.replace("GB", "") + el = int(el) * 1000000000 + elements.append(el) + tags["table_schema"] = elements[0] + tags["table"] = 
elements[1] + fields1["table_size"] = int(elements[2]) + fields1["index_size"] = int(elements[3]) + fields1["total_size"] = int(elements[4]) + fields1["live_tuples"] = int(elements[5]) + fields1["dead_tuples"] = int(elements[6]) + influx_string1 += "{},'{}'='{}','{}'='{}','{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement1, "node", tags["node"], "service", tags["service"], "table_schema", tags["table_schema"], "table", tags["table"], "table_size", fields1["table_size"], "index_size", fields1["index_size"], "total_size", fields1["total_size"], "live_tuples", fields1["live_tuples"], "dead_tuples", fields1["dead_tuples"]) + "\n" + good_string = True + if good_string: + # send table data to InfluxDB + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string1), shell=True) + p.communicate() + influx_string = influx_string1 = "" + time.sleep(ci["postgres"]) + postgres_output1.kill() + postgres_output.kill() + else: + time.sleep(20) + except KeyboardInterrupt: + if postgres_output is not None: + postgres_output.kill() + if postgres_output1 is not None: + postgres_output1.kill() + break + except Exception: + logging.error("postgres collection stopped unexpectedly with error: {}. 
Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collect postgres connections information +def collectPostgresConnections(influx_info, node, ci, fast): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + if fast: + logging.info("postgres_connections data starting collection with a constant collection interval") + else: + logging.info("postgres_connections data starting collection with a collection interval of {}s".format(ci["postgres"])) + measurement = "postgres_connections" + tags = {"node": node, "service": None, "state": None} + connections_output = None + influx_string = "" + while True: + try: + # make sure this is active controller, otherwise postgres queries wont work + if isActiveController(): + while True: + fields = {} + # outputs a list of postgres dbs and their connections + connections_output = Popen("sudo -u postgres psql --pset pager=off -q -c 'SELECT datname,state,count(*) from pg_stat_activity group by datname,state;'", shell=True, stdout=PIPE) + line = connections_output.stdout.readline() + if line == "" or line is None: + break + # skip header + connections_output.stdout.readline() + while True: + line = connections_output.stdout.readline().strip("\n") + if not line: + break + else: + line = line.replace(" ", "").split("|") + if len(line) != 3: + continue + else: + svc = line[0] + connections = int(line[2]) + tags["service"] = svc + if svc not in fields: + fields[svc] = {"active": 0, "idle": 0, "other": 0} + if line[1] == "active": + fields[svc]["active"] = connections + elif line[1] == "idle": + fields[svc]["idle"] = connections + else: + fields[svc]["other"] = connections + influx_string += "{},'{}'='{}','{}'='{}','{}'='{}' '{}'='{}'".format(measurement, "node", tags["node"], "service", tags["service"], "state", "active", "connections", fields[svc]["active"]) + "\n" + influx_string += "{},'{}'='{}','{}'='{}','{}'='{}' 
'{}'='{}'".format(measurement, "node", tags["node"], "service", tags["service"], "state", "idle", "connections", fields[svc]["idle"]) + "\n" + influx_string += "{},'{}'='{}','{}'='{}','{}'='{}' '{}'='{}'".format(measurement, "node", tags["node"], "service", tags["service"], "state", "other", "connections", fields[svc]["other"]) + "\n" + + # send data to InfluxDB + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + influx_string = "" + connections_output.kill() + if fast: + pass + else: + time.sleep(ci["postgres"]) + else: + time.sleep(20) + except KeyboardInterrupt: + if connections_output is not None: + connections_output.kill() + break + except Exception: + logging.error("postgres_connections collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects rabbitmq information +def collectRabbitMq(influx_info, node, ci): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("rabbitmq data starting collection with a collection interval of {}s".format(ci["rabbitmq"])) + measurement = "rabbitmq" + tags = OrderedDict([("node", node)]) + rabbitmq_output = None + while True: + try: + # make sure this is active controller, otherwise rabbit queries wont work + if isActiveController(): + while True: + fields = OrderedDict([]) + rabbitmq_output = Popen("sudo rabbitmqctl -n rabbit@localhost status", shell=True, stdout=PIPE) + # needed data starts where output = '{memory,[' + line = rabbitmq_output.stdout.readline() + # if no data is returned, exit + if line == "" or line is None: + rabbitmq_output.kill() + break + else: + line = rabbitmq_output.stdout.read().strip("\n").split("{memory,[") + if len(line) != 2: + rabbitmq_output.kill() + break + else: + # remove brackets from data + 
info = line[1].replace(" ", "").replace("{", "").replace("}", "").replace("\n", "").replace("[", "").replace("]", "").split(",") + for i in range(len(info) - 3): + if info[i].endswith("total"): + info[i] = info[i].replace("total", "memory_total") + # some data needs string manipulation + if info[i].startswith("clustering") or info[i].startswith("amqp"): + info[i] = "listeners_" + info[i] + if info[i].startswith("total_"): + info[i] = "descriptors_" + info[i] + if info[i].startswith("limit") or info[i].startswith("used"): + info[i] = "processes_" + info[i] + if info[i].replace("_", "").isalpha() and info[i + 1].isdigit(): + fields[info[i]] = info[i + 1] + s = generateString(measurement, tags.keys(), tags.values(), fields.keys(), fields.values()) + if s is None: + rabbitmq_output.kill() + else: + # send data to InfluxDB + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], s), shell=True) + p.communicate() + time.sleep(ci["rabbitmq"]) + rabbitmq_output.kill() + else: + time.sleep(20) + except KeyboardInterrupt: + if rabbitmq_output is not None: + rabbitmq_output.kill() + break + except Exception: + logging.error("rabbitmq collection stopped unexpectedly with error: {}. 
Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects rabbitmq messaging information +def collectRabbitMqSvc(influx_info, node, ci, services): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("rabbitmq_svc data starting collection with a collection interval of {}s".format(ci["rabbitmq"])) + measurement = "rabbitmq_svc" + tags = {"node": node, "service": None} + fields = {"messages": 0, "messages_ready": 0, "messages_unacknowledged": 0, "memory": 0, "consumers": 0} + rabbitmq_svc_output = None + good_string = False + influx_string = "" + while True: + try: + # make sure this is active controller, otherwise rabbit queries won't work + if isActiveController(): + while True: + rabbitmq_svc_output = Popen("sudo rabbitmqctl -n rabbit@localhost list_queues name messages messages_ready messages_unacknowledged memory consumers", shell=True, stdout=PIPE) + # if no data is returned, exit + if rabbitmq_svc_output.stdout.readline() == "" or rabbitmq_svc_output.stdout.readline() is None: + rabbitmq_svc_output.kill() + break + else: + for line in rabbitmq_svc_output.stdout: + line = line.split() + if not line: + break + else: + if len(line) != 6: + good_string = False + break + else: + # read line and fill fields + if line[0] in services: + tags["service"] = line[0] + fields["messages"] = line[1] + fields["messages_ready"] = line[2] + fields["messages_unacknowledged"] = line[3] + fields["memory"] = line[4] + fields["consumers"] = line[5] + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", tags["service"], "messages", fields["messages"], "messages_ready", fields["messages_ready"], "messages_unacknowledged", fields["messages_unacknowledged"], "memory", fields["memory"], "consumers", fields["consumers"]) + "\n" + good_string = True + if good_string: + # send data 
to InfluxDB + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + influx_string = "" + time.sleep(ci["rabbitmq"]) + rabbitmq_svc_output.kill() + else: + time.sleep(20) + except KeyboardInterrupt: + if rabbitmq_svc_output is not None: + rabbitmq_svc_output.kill() + break + except Exception: + logging.error("rabbitmq_svc collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects open file information +def collectFilestats(influx_info, node, ci, services, syseng_services, exclude_list, skip_list, collect_all): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("filestats data starting collection with a collection interval of {}s".format(ci["filestats"])) + measurement = "filestats" + tags = {"node": node} + influx_string = "" + while True: + try: + fields = {} + # fill dict with services from engtools.conf + if collect_all is False: + for svc in services: + fields[svc] = {"read/write": 0, "write": 0, "read": 0} + fields["static_syseng"] = {"read/write": 0, "write": 0, "read": 0} + fields["live_syseng"] = {"read/write": 0, "write": 0, "read": 0} + fields["total"] = {"read/write": 0, "write": 0, "read": 0} + for process in os.listdir("/proc/"): + if process.isdigit(): + # sometimes the process dies before reading its info + try: + svc = psutil.Process(int(process)).name() + svc = svc.split()[0].replace("(", "").replace(")", "").strip(":").split("/")[-1] + except Exception: + continue + if collect_all is False: + if svc in services: + try: + p = Popen("ls -l /proc/{}/fd".format(process), shell=True, stdout=PIPE) + p.stdout.readline() + while True: + line = p.stdout.readline().strip("\n").split() + if not line: + break + else: + priv = line[0] + if priv[1] == "r" and 
priv[2] == "w": + fields[svc]["read/write"] += 1 + fields["total"]["read/write"] += 1 + elif priv[1] == "r" and priv[2] != "w": + fields[svc]["read"] += 1 + fields["total"]["read"] += 1 + elif priv[1] != "r" and priv[2] == "w": + fields[svc]["write"] += 1 + fields["total"]["write"] += 1 + except Exception: + p.kill() + continue + p.kill() + + elif svc in syseng_services: + try: + p = Popen("ls -l /proc/{}/fd".format(process), shell=True, stdout=PIPE) + p.stdout.readline() + while True: + line = p.stdout.readline().strip("\n").split() + if not line: + break + else: + priv = line[0] + if svc == "live_stream.py": + if priv[1] == "r" and priv[2] == "w": + fields["live_syseng"]["read/write"] += 1 + fields["total"]["read/write"] += 1 + elif priv[1] == "r" and priv[2] != "w": + fields["live_syseng"]["read"] += 1 + fields["total"]["read"] += 1 + elif priv[1] != "r" and priv[2] == "w": + fields["live_syseng"]["write"] += 1 + fields["total"]["write"] += 1 + else: + if priv[1] == "r" and priv[2] == "w": + fields["static_syseng"]["read/write"] += 1 + fields["total"]["read/write"] += 1 + elif priv[1] == "r" and priv[2] != "w": + fields["static_syseng"]["read"] += 1 + fields["total"]["read"] += 1 + elif priv[1] != "r" and priv[2] == "w": + fields["static_syseng"]["write"] += 1 + fields["total"]["write"] += 1 + except Exception: + p.kill() + continue + p.kill() + + else: + # remove garbage processes + if svc in exclude_list or svc in skip_list or svc.startswith("-") or svc.endswith("-") or svc[0].isdigit() or svc[-1].isdigit() or svc[0].isupper(): + continue + elif svc not in fields: + fields[svc] = {"read/write": 0, "write": 0, "read": 0} + try: + p = Popen("ls -l /proc/{}/fd".format(process), shell=True, stdout=PIPE) + p.stdout.readline() + while True: + line = p.stdout.readline().strip("\n").split() + if not line: + break + else: + priv = line[0] + if priv[1] == "r" and priv[2] == "w": + fields[svc]["read/write"] += 1 + fields["total"]["read/write"] += 1 + elif priv[1] == "r" 
and priv[2] != "w": + fields[svc]["read"] += 1 + fields["total"]["read"] += 1 + elif priv[1] != "r" and priv[2] == "w": + fields[svc]["write"] += 1 + fields["total"]["write"] += 1 + if fields[svc]["read/write"] == 0 and fields[svc]["read"] == 0 and fields[svc]["write"] == 0: + del fields[svc] + except Exception: + p.kill() + continue + p.kill() + for key in fields.keys(): + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", key, "read/write", fields[key]["read/write"], "write", fields[key]["write"], "read", fields[key]["read"]) + "\n" + # send data to InfluxDB + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + influx_string = "" + time.sleep(ci["filestats"]) + except KeyboardInterrupt: + break + except Exception: + logging.error("filestats collection stopped unexpectedly with error: {}. 
Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects vshell information +def collectVswitch(influx_info, node, ci): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("vswitch data starting collection with a collection interval of {}s".format(ci["vswitch"])) + measurement = "vswitch" + tags = OrderedDict([("node", node), ("engine", 0)]) + tags1 = OrderedDict([("node", node), ("port", 0)]) + tags2 = OrderedDict([("node", node), ("interface", 0)]) + fields = OrderedDict([("cpuid", 0), ("rx_packets", 0), ("tx_packets", 0), ("rx_discard", 0), ("tx_discard", 0), ("tx_disabled", 0), ("tx_overflow", 0), ("tx_timeout", 0), ("usage", 0)]) + fields1 = OrderedDict([("rx_packets", 0), ("tx_packets", 0), ("rx_bytes", 0), ("tx_bytes", 0), ("tx_errors", 0), ("rx_errors", 0), ("rx_nombuf", 0)]) + fields2 = OrderedDict([("rx_packets", 0), ("tx_packets", 0), ("rx_bytes", 0), ("tx_bytes", 0), ("tx_errors", 0), ("rx_errors", 0), ("tx_discards", 0), ("rx_discards", 0), ("rx_floods", 0), ("rx_no_vlan", 0)]) + vshell_engine_stats_output = vshell_port_stats_output = vshell_interface_stats_output = None + influx_string = "" + while True: + try: + vshell_engine_stats_output = Popen("vshell engine-stats-list", shell=True, stdout=PIPE) + # skip first few lines + vshell_engine_stats_output.stdout.readline() + vshell_engine_stats_output.stdout.readline() + vshell_engine_stats_output.stdout.readline() + while True: + line = vshell_engine_stats_output.stdout.readline().replace("|", "").split() + if not line: + break + # skip lines like +++++++++++++++++++++++++++++ + elif line[0].startswith("+"): + continue + else: + # get info from output + i = 2 + tags["engine"] = line[1] + for key in fields: + fields[key] = line[i].strip("%") + i += 1 + influx_string += "{},'{}'='{}','{}'='{}' 
'{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement, tags.keys()[0], tags.values()[0], tags.keys()[1], tags.values()[1], fields.keys()[0], fields.values()[0], fields.keys()[1], fields.values()[1], fields.keys()[2], fields.values()[2], fields.keys()[3], fields.values()[3], fields.keys()[4], fields.values()[4], fields.keys()[5], fields.values()[5], fields.keys()[6], fields.values()[6], fields.keys()[7], fields.values()[7], fields.keys()[8], fields.values()[8]) + "\n" + vshell_engine_stats_output.kill() + vshell_port_stats_output = Popen("vshell port-stats-list", shell=True, stdout=PIPE) + vshell_port_stats_output.stdout.readline() + vshell_port_stats_output.stdout.readline() + vshell_port_stats_output.stdout.readline() + while True: + line = vshell_port_stats_output.stdout.readline().replace("|", "").split() + if not line: + break + elif line[0].startswith("+"): + continue + else: + i = 3 + tags1["port"] = line[1] + for key in fields1: + fields1[key] = line[i].strip("%") + i += 1 + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement, tags1.keys()[0], tags1.values()[0], tags1.keys()[1], tags1.values()[1], fields1.keys()[0], fields1.values()[0], fields1.keys()[1], fields1.values()[1], fields1.keys()[2], fields1.values()[2], fields1.keys()[3], fields1.values()[3], fields1.keys()[4], fields1.values()[4], fields1.keys()[5], fields1.values()[5], fields1.keys()[6], fields1.values()[6]) + "\n" + vshell_port_stats_output.kill() + vshell_interface_stats_output = Popen("vshell interface-stats-list", shell=True, stdout=PIPE) + vshell_interface_stats_output.stdout.readline() + vshell_interface_stats_output.stdout.readline() + vshell_interface_stats_output.stdout.readline() + while True: + line = vshell_interface_stats_output.stdout.readline().replace("|", "").split() + if not line: + break + elif line[0].startswith("+"): + continue + else: + if 
line[2] == "ethernet" and line[3].startswith("eth"): + i = 4 + tags2["interface"] = line[3] + for key in fields2: + fields2[key] = line[i].strip("%") + i += 1 + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}','{}'='{}'".format(measurement, tags2.keys()[0], tags2.values()[0], tags2.keys()[1], tags2.values()[1], fields2.keys()[0], fields2.values()[0], fields2.keys()[1], fields2.values()[1], fields2.keys()[2], fields2.values()[2], fields2.keys()[3], fields2.values()[3], fields2.keys()[4], fields2.values()[4], fields2.keys()[5], fields2.values()[5], fields2.keys()[6], fields2.values()[6], fields2.keys()[7], fields2.values()[7], fields2.keys()[8], fields2.values()[8], fields2.keys()[9], fields2.values()[9]) + "\n" + else: + continue + vshell_interface_stats_output.kill() + # send data to InfluxDB + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + influx_string = "" + time.sleep(ci["vswitch"]) + except KeyboardInterrupt: + if vshell_engine_stats_output is not None: + vshell_engine_stats_output.kill() + if vshell_port_stats_output is not None: + vshell_port_stats_output.kill() + if vshell_interface_stats_output is not None: + vshell_interface_stats_output.kill() + break + except Exception: + logging.error("vswitch collection stopped unexpectedly with error: {}. 
Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# collects the number of cores +def collectCpuCount(influx_info, node, ci): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("cpu_count data starting collection with a collection interval of {}s".format(ci["cpu_count"])) + measurement = "cpu_count" + tags = {"node": node} + while True: + try: + fields = {"cpu_count": cpu_count()} + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{},'{}'='{}' '{}'='{}''".format(influx_info[0], influx_info[1], influx_info[2], measurement, "node", tags["node"], "cpu_count", fields["cpu_count"]), shell=True) + p.communicate() + time.sleep(ci["cpu_count"]) + except KeyboardInterrupt: + break + except Exception: + logging.error("cpu_count collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info())) + + +# collect API GET and POST requests/sec +def collectApi(influx_info, node, ci, openstack_svcs): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + logging.info("api_request data starting collection with a collection interval of {}s".format(ci["cpu_count"])) + measurement = "api_requests" + tags = {"node": node} + openstack_services = openstack_svcs + influx_string = "" + while True: + try: + fields = {} + tmp = {} + tmp1 = {} + # get initial values + for s in openstack_services: + fields[s] = {"get": 0, "post": 0} + tmp[s] = {"get": 0, "post": 0} + log = "/var/log/{0}/{0}-api.log".format(s) + if os.path.exists(log): + if s == "ceilometer": + p = Popen("awk '/INFO/ && /500/' {} | wc -l".format(log), shell=True, stdout=PIPE) + else: + p = Popen("awk '/INFO/ && /GET/' {} | wc -l".format(log), shell=True, stdout=PIPE) + init_api_get = int(p.stdout.readline()) + tmp[s]["get"] = init_api_get + p.kill() + p = Popen("awk '/INFO/ 
&& /POST/' {} | wc -l".format(log), shell=True, stdout=PIPE) + init_api_post = int(p.stdout.readline()) + tmp[s]["post"] = init_api_post + p.kill() + time.sleep(1) + # get new values + for s in openstack_services: + tmp1[s] = {"get": 0, "post": 0} + log = "/var/log/{0}/{0}-api.log".format(s) + if os.path.exists(log): + if s == "ceilometer": + p = Popen("awk '/INFO/ && /500/' {} | wc -l".format(log), shell=True, stdout=PIPE) + else: + p = Popen("awk '/INFO/ && /GET/' {} | wc -l".format(log), shell=True, stdout=PIPE) + api_get = int(p.stdout.readline()) + tmp1[s]["get"] = api_get + p.kill() + p = Popen("awk '/INFO/ && /POST/' {} | wc -l".format(log), shell=True, stdout=PIPE) + api_post = int(p.stdout.readline()) + tmp1[s]["post"] = api_post + p.kill() + # take difference + for key in fields: + if (key in tmp and key in tmp1) and (tmp1[key]["get"] >= tmp[key]["get"]) and (tmp1[key]["post"] >= tmp[key]["post"]): + fields[key]["get"] = (tmp1[key]["get"] - tmp[key]["get"]) + fields[key]["post"] = (tmp1[key]["post"] - tmp[key]["post"]) + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", key, "get_requests", fields[key]["get"], "post_requests", fields[key]["post"]) + "\n" + p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) + p.communicate() + influx_string = "" + except KeyboardInterrupt: + break + except Exception: + logging.error("api_request collection stopped unexpectedly with error: {}. 
Restarting process...".format(sys.exc_info())) + time.sleep(3) + + +# returns the cores dedicated to platform use +def getPlatformCores(node, cpe): + if cpe is True or node.startswith("compute"): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + core_list = list() + try: + with open("/etc/nova/compute_reserved.conf", "r") as f: + for line in f: + if line.startswith("PLATFORM_CPU_LIST"): + core_list = line.split("=")[1].replace("\"", "").strip("\n").split(",") + core_list = [int(x) for x in core_list] + return core_list + except Exception: + logging.warning("skipping platform specific collection for {} due to error: {}".format(node, sys.exc_info())) + return core_list + else: + return [] + + +# determine if controller is active/standby +def isActiveController(): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + try: + o = Popen("sm-dump", shell=True, stdout=PIPE) + o.stdout.readline() + o.stdout.readline() + # read line for active/standby + l = o.stdout.readline().strip("\n").split() + per = l[1] + o.kill() + if per == "active": + return True + else: + return False + except Exception: + if o is not None: + o.kill() + logging.error("sm-dump command could not be called properly. This is usually caused by a swact. Trying again on next call: {}".format(sys.exc_info())) + return False + + +# checks whether the duration param has been set. If set, sleep; then kill processes upon waking up +def checkDuration(duration): + logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + if duration is None: + return None + else: + time.sleep(duration) + print "Duration interval has ended. Killing processes now" + logging.warning("Duration interval has ended. 
Killing processes now")
+        # raising rather than returning lets the caller's existing
+        # KeyboardInterrupt path perform the shutdown/cleanup
+        raise KeyboardInterrupt
+
+
+# kill all processes and log each death
+def killProcesses(tasks):
+    logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
+    for t in tasks:
+        try:
+            # log first: after terminate() the task is gone
+            logging.info("{} data stopped collection".format(str(t.name)))
+            t.terminate()
+        except Exception:
+            # a task may already have exited; keep terminating the rest
+            continue
+
+
+# create database in InfluxDB and add it to Grafana
+def createDB(influx_info, grafana_port, grafana_api_key):
+    # influx_info is [ip, port, db_name]; raises/exits on unreachable services
+    logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
+    p = None
+    try:
+        logging.info("Adding database to InfluxDB and Grafana")
+        # create database in InfluxDB if not already created. Will NOT overwrite previous db
+        p = Popen("curl -s -XPOST 'http://'{}':'{}'/query' --data-urlencode 'q=CREATE DATABASE {}'".format(influx_info[0], influx_info[1], influx_info[2]), shell=True, stdout=PIPE)
+        response = p.stdout.read().strip("\n")
+        # an empty response is treated as the service being unreachable
+        if response == "":
+            raise Exception("An error occurred while creating the database: Please make sure the Grafana and InfluxDB services are running")
+        else:
+            logging.info("InfluxDB response: {}".format(response))
+        p.kill()
+
+        # add database to Grafana
+        # NOTE(review): JSON body is built by string interpolation — assumes the
+        # db name/ip/port contain no quotes; confirm inputs are trusted config
+        grafana_db = '{"name":"%s", "type":"influxdb", "url":"http://%s:%s", "access":"proxy", "isDefault":false, "database":"%s"}' % (influx_info[2], influx_info[0], influx_info[1], influx_info[2])
+        p = Popen("curl -s 'http://{}:{}/api/datasources' -H 'Accept: application/json' -H 'Content-Type: application/json' -H 'Authorization: Bearer {}' --data-binary '{}'".format(influx_info[0], grafana_port, grafana_api_key, grafana_db), shell=True, stdout=PIPE)
+        response = p.stdout.read().strip("\n")
+        if response == "":
+            raise Exception("An error occurred while creating the database: Please make sure the Grafana and InfluxDB services are running")
+        else:
+            logging.info("Grafana response: {}".format(response))
+        
p.kill()
+    except KeyboardInterrupt:
+        if p is not None:
+            p.kill()
+    except Exception as e:
+        print e.message
+        sys.exit(0)
+
+
+# delete database from InfluxDB and remove it from Grafana
+def deleteDB(influx_info, grafana_port, grafana_api_key):
+    # influx_info is [ip, port, db_name]; prompts for confirmation first
+    logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
+    p = None
+    try:
+        answer = str(raw_input("\nAre you sure you would like to delete {}? (Y/N): ".format(influx_info[2]))).lower()
+    except Exception:
+        answer = None
+    # NOTE(review): a failed or empty prompt counts as consent — presumably so
+    # the auto-delete path works without a TTY; confirm this is intended
+    if answer is None or answer == "" or answer == "y" or answer == "yes":
+        try:
+            logging.info("Removing database from InfluxDB and Grafana")
+            print "Removing database from InfluxDB and Grafana. Please wait..."
+            # delete database from InfluxDB
+            p = Popen("curl -s -XPOST 'http://'{}':'{}'/query' --data-urlencode 'q=DROP DATABASE {}'".format(influx_info[0], influx_info[1], influx_info[2]), shell=True, stdout=PIPE)
+            response = p.stdout.read().strip("\n")
+            # an empty response is treated as the service being unreachable
+            if response == "":
+                raise Exception("An error occurred while removing the database: Please make sure the Grafana and InfluxDB services are running")
+            else:
+                logging.info("InfluxDB response: {}".format(response))
+            p.kill()
+
+            # get database ID for db removal
+            # NOTE(review): naive parse of the JSON reply ({"id":N}); assumes
+            # "id" is the first key — verify against the Grafana API version
+            p = Popen("curl -s -G 'http://{}:{}/api/datasources/id/{}' -H 'Accept: application/json' -H 'Content-Type: application/json' -H 'Authorization: Bearer {}'".format(influx_info[0], grafana_port, influx_info[2], grafana_api_key), shell=True, stdout=PIPE)
+            id = p.stdout.read().split(":")[1].strip("}")
+            if id == "":
+                raise Exception("An error occurred while removing the database: Could not determine the database ID")
+            p.kill()
+
+            # remove database from Grafana
+            p = Popen("curl -s -XDELETE 'http://{}:{}/api/datasources/{}' -H 'Accept: application/json' -H 'Content-Type: application/json' -H 'Authorization: Bearer {}'".format(influx_info[0], grafana_port, id, grafana_api_key), shell=True, stdout=PIPE)
+            response = 
p.stdout.read().strip("\n")
+            if response == "":
+                raise Exception("An error occurred while removing the database: Please make sure the Grafana and InfluxDB services are running")
+            else:
+                logging.info("Grafana response: {}".format(response))
+            p.kill()
+        except KeyboardInterrupt:
+            if p is not None:
+                p.kill()
+        except Exception as e:
+            print e.message
+            sys.exit(0)
+
+
+# used for output log
+def appendToFile(file, content):
+    # serialize writers with an exclusive flock: several collector processes
+    # append to the same log file concurrently
+    with open(file, "a") as f:
+        fcntl.flock(f, fcntl.LOCK_EX)
+        f.write(content + '\n')
+        fcntl.flock(f, fcntl.LOCK_UN)
+
+
+# main program
+if __name__ == "__main__":
+    # make sure user is root
+    if os.geteuid() != 0:
+        print "Must be run as root!\n"
+        sys.exit(0)
+
+    # initialize variables
+    cpe_lab = False
+    influx_ip = influx_port = influx_db = ""
+    external_if = ""
+    influx_info = list()
+    grafana_port = ""
+    grafana_api_key = ""
+    controller_services = list()
+    compute_services = list()
+    storage_services = list()
+    rabbit_services = list()
+    common_services = list()
+    services = {}
+    # processes belonging to this tool itself, and the static collector scripts
+    live_svc = ("live_stream.py",)
+    static_svcs = ("occtop", "memtop", "schedtop", "top.sh", "iostat.sh", "netstats.sh", "diskstats.sh", "memstats.sh", "filestats.sh", "ceph.sh", "postgres.sh", "rabbitmq.sh", "vswitch.sh")
+    # None means "collector disabled"; filled in from engtools.conf below
+    collection_intervals = {"memtop": None, "memstats": None, "occtop": None, "schedtop": None, "load_avg": None, "cpu_count": None, "diskstats": None, "iostat": None, "filestats": None, "netstats": None, "postgres": None, "rabbitmq": None, "vswitch": None}
+    openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres")
+    # memstats, schedtop, and filestats must skip/exclude certain fields when collect_all is enabled. 
No need to collect this stuff + exclude_list = ("python", "python2", "bash", "perl", "sudo", "init") + skip_list = ("ps", "top", "sh", "", "curl", "awk", "wc", "sleep", "lsof", "cut", "grep", "ip", "tail", "su") + duration = None + unconverted_duration = "" + collect_api_requests = False + api_requests = "" + auto_delete_db = False + delete_db = "" + collect_all_services = False + all_services = "" + fast_postgres_connections = False + fast_postgres = "" + config = ConfigParser.ConfigParser() + + node = os.popen("hostname").read().strip("\n") + + # get info from engtools.conf + try: + conf_file = "" + if "engtools.conf" in tuple(os.listdir(os.getcwd())): + conf_file = os.getcwd() + "/engtools.conf" + elif "engtools.conf" in tuple(os.listdir("/etc/engtools/")): + conf_file = "/etc/engtools/engtools.conf" + config.read(conf_file) + if config.get("LabConfiguration", "CPE_LAB").lower() == "y" or config.get("LabConfiguration", "CPE_LAB").lower() == "yes": + cpe_lab = True + if node.startswith("controller"): + external_if = config.get("CollectInternal", "{}_EXTERNAL_INTERFACE".format(node.upper().replace("-", ""))) + influx_ip = config.get("RemoteServer", "INFLUX_IP") + influx_port = config.get("RemoteServer", "INFLUX_PORT") + influx_db = config.get("RemoteServer", "INFLUX_DB") + grafana_port = config.get("RemoteServer", "GRAFANA_PORT") + grafana_api_key = config.get("RemoteServer", "GRAFANA_API_KEY") + duration = config.get("LiveStream", "DURATION") + unconverted_duration = config.get("LiveStream", "DURATION") + api_requests = config.get("AdditionalOptions", "API_REQUESTS") + delete_db = config.get("AdditionalOptions", "AUTO_DELETE_DB") + all_services = config.get("AdditionalOptions", "ALL_SERVICES") + fast_postgres = config.get("AdditionalOptions", "FAST_POSTGRES_CONNECTIONS") + # additional options + if api_requests.lower() == "y" or api_requests.lower() == "yes": + collect_api_requests = True + if delete_db.lower() == "y" or delete_db.lower() == "yes": + 
auto_delete_db = True + if all_services.lower() == "y" or all_services.lower() == "yes": + collect_all_services = True + if fast_postgres.lower() == "y" or fast_postgres.lower() == "yes": + fast_postgres_connections = True + # convert duration into seconds + if duration == "": + duration = None + elif duration.endswith("s") or duration.endswith("S"): + duration = duration.strip("s") + duration = duration.strip("S") + duration = int(duration) + elif duration.endswith("m") or duration.endswith("M"): + duration = duration.strip("m") + duration = duration.strip("M") + duration = int(duration) * 60 + elif duration.endswith("h") or duration.endswith("H"): + duration = duration.strip("h") + duration = duration.strip("H") + duration = int(duration) * 3600 + elif duration.endswith("d") or duration.endswith("D"): + duration = duration.strip("d") + duration = duration.strip("D") + duration = int(duration) * 3600 * 24 + controller_services = tuple(config.get("ControllerServices", "CONTROLLER_SERVICE_LIST").split()) + compute_services = tuple(config.get("ComputeServices", "COMPUTE_SERVICE_LIST").split()) + storage_services = tuple(config.get("StorageServices", "STORAGE_SERVICE_LIST").split()) + rabbit_services = tuple(config.get("RabbitmqServices", "RABBITMQ_QUEUE_LIST").split()) + common_services = tuple(config.get("CommonServices", "COMMON_SERVICE_LIST").split()) + # get collection intervals + for i in config.options("Intervals"): + if config.get("Intervals", i) == "" or config.get("Intervals", i) is None: + collection_intervals[i] = None + else: + collection_intervals[i] = int(config.get("Intervals", i)) + except Exception: + print "An error has occurred when parsing the engtools.conf configuration file: {}".format(sys.exc_info()) + sys.exit(0) + + syseng_services = live_svc + static_svcs + if cpe_lab is True: + services["controller_services"] = controller_services + compute_services + storage_services + common_services + else: + controller_services += common_services + 
compute_services += common_services + storage_services += common_services + services["controller_services"] = controller_services + services["compute_services"] = compute_services + services["storage_services"] = storage_services + services["common_services"] = common_services + services["syseng_services"] = syseng_services + services["rabbit_services"] = rabbit_services + + influx_info.append(influx_ip) + influx_info.append(influx_port) + influx_info.append(influx_db) + + # add config options to log + with open("/tmp/livestream.log", "w") as e: + e.write("Configuration for {}:\n".format(node)) + e.write("-InfluxDB address: {}:{}\n".format(influx_ip, influx_port)) + e.write("-InfluxDB name: {}\n".format(influx_db)) + e.write("-CPE lab: {}\n".format(str(cpe_lab))) + e.write(("-Collect API requests: {}\n".format(str(collect_api_requests)))) + e.write(("-Collect all services: {}\n".format(str(collect_all_services)))) + e.write(("-Fast postgres connections: {}\n".format(str(fast_postgres_connections)))) + e.write(("-Automatic database removal: {}\n".format(str(auto_delete_db)))) + if duration is not None: + e.write("-Live stream duration: {}\n".format(unconverted_duration)) + e.close() + + # add POSTROUTING entry to NAT table + if cpe_lab is False: + # check controller-0 for NAT entry. 
If not there, add it + if node.startswith("controller"): + # use first interface if not specified in engtools.conf + if external_if == "" or external_if is None: + p = Popen("ifconfig", shell=True, stdout=PIPE) + external_if = p.stdout.readline().split(":")[0] + p.kill() + appendToFile("/tmp/livestream.log", "-External interface for {}: {}".format(node, external_if)) + # enable IP forwarding + p = Popen("sysctl -w net.ipv4.ip_forward=1 > /dev/null", shell=True) + p.communicate() + p = Popen("iptables -t nat -L --line-numbers", shell=True, stdout=PIPE) + tmp = [] + # entries need to be removed in reverse order + for line in p.stdout: + tmp.append(line.strip("\n")) + for line in reversed(tmp): + l = " ".join(line.strip("\n").split()[1:]) + # if an entry already exists, remove it + if l.startswith("MASQUERADE tcp -- anywhere"): + line_number = line.strip("\n").split()[0] + p1 = Popen("iptables -t nat -D POSTROUTING {}".format(line_number), shell=True) + p1.communicate() + p.kill() + appendToFile("/tmp/livestream.log", "-Adding NAT information to allow compute/storage nodes to communicate with remote server\n") + # add new entry for both InfluxDB and Grafana + p = Popen("iptables -t nat -A POSTROUTING -p tcp -o {} -d {} --dport {} -j MASQUERADE".format(external_if, influx_ip, influx_port), shell=True) + p.communicate() + p = Popen("iptables -t nat -A POSTROUTING -p tcp -o {} -d {} --dport {} -j MASQUERADE".format(external_if, influx_ip, grafana_port), shell=True) + p.communicate() + + appendToFile("/tmp/livestream.log", "\nStarting collection at {}\n".format(datetime.datetime.utcnow())) + tasks = [] + + createDB(influx_info, grafana_port, grafana_api_key) + + try: + node_type = str(node.split("-")[0]) + # if not a standard node, run the common functions with collect_all enabled + if node_type != "controller" and node_type != "compute" and node_type != "storage": + node_type = "common" + collect_all_services = True + + if collection_intervals["memstats"] is not None: + 
p = Process(target=collectMemstats, args=(influx_info, node, collection_intervals, services["{}_services".format(node_type)], services["syseng_services"], openstack_services, exclude_list, skip_list, collect_all_services), name="memstats") + tasks.append(p) + p.start() + if collection_intervals["schedtop"] is not None: + p = Process(target=collectSchedtop, args=(influx_info, node, collection_intervals, services["{}_services".format(node_type)], services["syseng_services"], openstack_services, exclude_list, skip_list, collect_all_services), name="schedtop") + tasks.append(p) + p.start() + if collection_intervals["filestats"] is not None: + p = Process(target=collectFilestats, args=(influx_info, node, collection_intervals, services["{}_services".format(node_type)], services["syseng_services"], exclude_list, skip_list, collect_all_services), name="filestats") + tasks.append(p) + p.start() + if collection_intervals["occtop"] is not None: + p = Process(target=collectOcctop, args=(influx_info, node, collection_intervals, getPlatformCores(node, cpe_lab)), name="occtop") + tasks.append(p) + p.start() + if collection_intervals["load_avg"] is not None: + p = Process(target=collectLoadavg, args=(influx_info, node, collection_intervals), name="load_avg") + tasks.append(p) + p.start() + if collection_intervals["cpu_count"] is not None: + p = Process(target=collectCpuCount, args=(influx_info, node, collection_intervals), name="cpu_count") + tasks.append(p) + p.start() + if collection_intervals["memtop"] is not None: + p = Process(target=collectMemtop, args=(influx_info, node, collection_intervals), name="memtop") + tasks.append(p) + p.start() + if collection_intervals["diskstats"] is not None: + p = Process(target=collectDiskstats, args=(influx_info, node, collection_intervals), name="diskstats") + tasks.append(p) + p.start() + if collection_intervals["iostat"] is not None: + p = Process(target=collectIostat, args=(influx_info, node, collection_intervals), name="iostat") + 
tasks.append(p) + p.start() + if collection_intervals["netstats"] is not None: + p = Process(target=collectNetstats, args=(influx_info, node, collection_intervals), name="netstats") + tasks.append(p) + p.start() + if collect_api_requests is True and node_type == "controller": + p = Process(target=collectApi, args=(influx_info, node, collection_intervals, openstack_services), name="api_requests") + tasks.append(p) + p.start() + + if node_type == "controller": + if collection_intervals["postgres"] is not None: + p = Process(target=collectPostgres, args=(influx_info, node, collection_intervals), name="postgres") + tasks.append(p) + p.start() + p = Process(target=collectPostgresConnections, args=(influx_info, node, collection_intervals, fast_postgres_connections), name="postgres_connections") + tasks.append(p) + p.start() + if collection_intervals["rabbitmq"] is not None: + p = Process(target=collectRabbitMq, args=(influx_info, node, collection_intervals), name="rabbitmq") + tasks.append(p) + p.start() + p = Process(target=collectRabbitMqSvc, args=(influx_info, node, collection_intervals, services["rabbit_services"]), name="rabbitmq_svc") + tasks.append(p) + p.start() + + if node_type == "compute" or cpe_lab is True: + if collection_intervals["vswitch"] is not None: + p = Process(target=collectVswitch, args=(influx_info, node, collection_intervals), name="vswitch") + tasks.append(p) + p.start() + + print "Sending data to InfluxDB. 
Please tail /tmp/livestream.log" + + checkDuration(duration) + # give a small delay to ensure services have started + time.sleep(3) + for t in tasks: + os.wait() + except KeyboardInterrupt: + pass + finally: + # end here once duration param has ended or ctrl-c is pressed + appendToFile("/tmp/livestream.log", "\nEnding collection at {}\n".format(datetime.datetime.utcnow())) + if tasks is not None and len(tasks) > 0: + killProcesses(tasks) + if auto_delete_db is True: + deleteDB(influx_info, grafana_port, grafana_api_key) + sys.exit(0) diff --git a/tools/engtools/hostdata-collectors/scripts/memstats.sh b/tools/engtools/hostdata-collectors/scripts/memstats.sh new file mode 100644 index 000000000..664b10b46 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/memstats.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# Usage: memstats.sh [-p ] [-i ] [-c ] [-h] +TOOLBIN=$(dirname $0) + +# Initialize tools environment variables, and define common utility functions +. ${TOOLBIN}/engtools_util.sh +tools_init +if [ $? -ne 0 ]; then + echo "FATAL, tools_init - could not setup environment" + exit $? +fi + +PAGE_SIZE=$(getconf PAGE_SIZE) + +# Enable use of INTERVAL_SEC sample interval +OPT_USE_INTERVALS=1 + +# Print key networking device statistics +function print_memory() +{ + # Configuration for netcmds + MEMINFO=/proc/meminfo + NODEINFO=/sys/devices/system/node/node?/meminfo + BUDDYINFO=/proc/buddyinfo + SLABINFO=/proc/slabinfo + + print_separator + TOOL_HIRES_TIME + + ${ECHO} "# ${MEMINFO}" + ${CAT} ${MEMINFO} + ${ECHO} + + ${ECHO} "# ${NODEINFO}" + ${CAT} ${NODEINFO} + ${ECHO} + + ${ECHO} "# ${BUDDYINFO}" + ${CAT} ${BUDDYINFO} + ${ECHO} + + ${ECHO} "# PSS" + cat /proc/*/smaps 2>/dev/null | \ + awk '/^Pss:/ {a += $2;} END {printf "%d MiB\n", a/1024.0;}' + ${ECHO} + + # use old slabinfo format (i.e. 
slub not enabled in kernel)
+    ${ECHO} "# ${SLABINFO}"
+    # reformat /proc/slabinfo and append a computed per-cache KiB column
+    ${CAT} ${SLABINFO} | \
+        awk -v page_size_B=${PAGE_SIZE} '
+BEGIN {page_KiB = page_size_B/1024; TOT_KiB = 0;}
+# NF==17: the header row — field names are wrapped in <>; strip the brackets
+# and re-print the selected columns plus a "KiB" column header
+(NF == 17) {
+    gsub(/[<>]/, "");
+    printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8s\n",
+           $2, $3, $4, $5, $6, $7, $8, $10, $11, $12, $13, $15, $16, $17, "KiB");
+}
+# NF==16: a data row — derive the cache size in KiB from object count,
+# objects-per-slab and pages-per-slab, and accumulate the grand total
+(NF == 16) {
+    num_objs=$3; obj_per_slab=$5; pages_per_slab=$6;
+    KiB = (obj_per_slab > 0) ? page_KiB*num_objs/obj_per_slab*pages_per_slab : 0;
+    TOT_KiB += KiB;
+    printf("%-22s %11d %8d %8d %10d %12d %1s %5d %10d %12d %1s %12d %9d %11d %8d\n",
+           $1, $2, $3, $4, $5, $6, $7, $9, $10, $11, $12, $14, $15, $16, KiB);
+}
+# summary row: only the accumulated KiB total is meaningful
+END {
+    printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8d\n",
+           "TOTAL", "-", "-", "-", "-", "-", ":", "-", "-", "-", ":", "-", "-", "-", TOT_KiB);
+}
+' 2>/dev/null
+    ${ECHO}
+
+    ${ECHO} "# disk usage: rootfs, tmpfs"
+    cmd='df -h -H -T --local -t rootfs -t tmpfs'
+    ${ECHO} "Disk space usage rootfs,tmpfs (SI):"
+    ${ECHO} "${cmd}"
+    ${cmd}
+    ${ECHO}
+
+    # per-process memory snapshot, largest resident set first
+    CMD='ps -e -o ppid,pid,nlwp,rss:10,vsz:10,cmd --sort=-rss'
+    ${ECHO} "# ${CMD}"
+    ${CMD}
+    ${ECHO}
+}
+
+#-------------------------------------------------------------------------------
+# MAIN Program:
+#-------------------------------------------------------------------------------
+# Parse input options
+tools_parse_options "${@}"
+
+# Set affinity of current script
+CPULIST=""
+set_affinity ${CPULIST}
+
+LOG "collecting ${TOOLNAME} for ${PERIOD_MIN} minutes, with ${INTERVAL_SEC} second sample intervals."
+ +# Print tools generic tools header +tools_header + +# Calculate number of sample repeats based on overall interval and sampling interval +((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC)) + +for ((rep=1; rep <= REPEATS ; rep++)) +do + print_memory + sleep ${INTERVAL_SEC} +done +print_memory +LOG "done" + +# normal program exit +tools_cleanup 0 +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/netstats.sh b/tools/engtools/hostdata-collectors/scripts/netstats.sh new file mode 100644 index 000000000..4ed13b04e --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/netstats.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Usage: netstats.sh [-p ] [-i ] [-c ] [-h] +TOOLBIN=$(dirname $0) + +# Initialize tools environment variables, and define common utility functions +. ${TOOLBIN}/engtools_util.sh +tools_init +if [ $? -ne 0 ]; then + echo "FATAL, tools_init - could not setup environment" + exit $? +fi + +# Enable use of INTERVAL_SEC sample interval +OPT_USE_INTERVALS=1 + +# Print key networking device statistics +function print_netcmds() +{ + # Configuration for netcmds + DEV=/proc/net/dev + NETSTAT=/proc/net/netstat + + print_separator + TOOL_HIRES_TIME + + for net in \ + ${DEV} ${NETSTAT} + do + if [ -e "${net}" ] + then + ${ECHO} "# ${net}" + ${CAT} ${net} + ${ECHO} + fi + done +} + +#------------------------------------------------------------------------------- +# MAIN Program: +#------------------------------------------------------------------------------- +# Parse input options +tools_parse_options "${@}" + +# Set affinity of current script +CPULIST="" +set_affinity ${CPULIST} + +LOG "collecting ${TOOLNAME} for ${PERIOD_MIN} minutes, with ${INTERVAL_SEC} second sample intervals." 
+ +# Print tools generic tools header +tools_header + +# Calculate number of sample repeats based on overall interval and sampling interval +((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC)) + +for ((rep=1; rep <= REPEATS ; rep++)) +do + print_netcmds + sleep ${INTERVAL_SEC} +done +print_netcmds +LOG "done" + +# normal program exit +tools_cleanup 0 +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/postgres.sh b/tools/engtools/hostdata-collectors/scripts/postgres.sh new file mode 100644 index 000000000..9bcf8d1f3 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/postgres.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Usage: postgres.sh [-p ] [-i ] [-c ] [-h] +TOOLBIN=$(dirname $0) + +# Initialize tools environment variables, and define common utility functions +. ${TOOLBIN}/engtools_util.sh +tools_init +if [ $? -ne 0 ]; then + echo "FATAL, tools_init - could not setup environment" + exit $? +fi + +# Enable use of INTERVAL_SEC sample interval +OPT_USE_INTERVALS=1 + +# Print key networking device statistics +function print_postgres() +{ + print_separator + TOOL_HIRES_TIME + + # postgressql command: set user, disable pagination, and be quiet + PSQL="sudo -u postgres psql --pset pager=off -q" + + # List postgres databases + db_list=( $(${PSQL} -t -c "SELECT datname FROM pg_database WHERE datistemplate = false;") ) + ${ECHO} "# postgres databases" + echo "db_list = ${db_list[@]}" + ${ECHO} + + # List sizes of all postgres databases (similar to "\l+") + ${ECHO} "# postgres database sizes" + ${PSQL} -c " +SELECT + pg_database.datname, + pg_database_size(pg_database.datname), + pg_size_pretty(pg_database_size(pg_database.datname)) +FROM pg_database +ORDER BY pg_database_size DESC; +" + + # For each database, list tables and their sizes (similar to "\dt+") + for db in "${db_list[@]}" + do + ${ECHO} "# postgres database: ${db}" + ${PSQL} -d ${db} -c " +SELECT + table_schema, + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) 
AS indexes_size, + pg_size_pretty(total_size) AS total_size, + live_tuples, + dead_tuples +FROM ( + SELECT + table_schema, + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size, + pg_stat_get_live_tuples(table_name::regclass) AS live_tuples, + pg_stat_get_dead_tuples(table_name::regclass) AS dead_tuples + FROM ( + SELECT + table_schema, + table_name + FROM information_schema.tables + WHERE table_schema='public' + AND table_type='BASE TABLE' + ) AS all_tables + ORDER BY total_size DESC +) AS pretty_sizes; +" + + ${ECHO} "# postgres database vacuum: ${db}" + ${PSQL} -d ${db} -c " +SELECT + relname, + n_live_tup, + n_dead_tup, + last_vacuum, + last_autovacuum, + last_analyze, + last_autoanalyze +FROM pg_stat_user_tables; +" + done + + # Specific table counts (This is very SLOW, look at "live tuples" instead) + # Number of keystone tokens + #${ECHO} "# keystone token count" + + # Number of postgres connections + ${ECHO} "# postgres database connections" + CONN=$(ps -C postgres -o cmd= | wc -l) + CONN_T=$(ps -C postgres -o cmd= | awk '/postgres: / {print $3}' | awk '{for(i=1;i<=NF;i++) a[$i]++} END {for(k in a) print k, a[k]}' | sort -k 2 -nr ) + ${ECHO} "connections total = ${CONN}" + ${ECHO} + ${ECHO} "connections breakdown:" + ${ECHO} "${CONN_T}" + ${ECHO} + + ${ECHO} "connections breakdown (query):" + ${PSQL} -c "SELECT datname,state,count(*) from pg_stat_activity group by datname,state;" + ${ECHO} + + ${ECHO} "connections idle age:" + ${PSQL} -c "SELECT datname,age(now(),state_change) from pg_stat_activity where state='idle';" + ${ECHO} +} + +#------------------------------------------------------------------------------- +# MAIN Program: +#------------------------------------------------------------------------------- +# Parse input options +tools_parse_options "${@}" + +# Set affinity of current script +CPULIST="" +set_affinity ${CPULIST} + +LOG "collecting 
${TOOLNAME} for ${PERIOD_MIN} minutes, with ${INTERVAL_SEC} second sample intervals." + +# Print tools generic tools header +tools_header + +# Calculate number of sample repeats based on overall interval and sampling interval +((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC)) + +for ((rep=1; rep <= REPEATS ; rep++)) +do + print_postgres + sleep ${INTERVAL_SEC} +done +print_postgres +LOG "done" + +# normal program exit +tools_cleanup 0 +exit 0 diff --git a/tools/engtools/hostdata-collectors/scripts/rabbitmq.sh b/tools/engtools/hostdata-collectors/scripts/rabbitmq.sh new file mode 100644 index 000000000..c588c16c5 --- /dev/null +++ b/tools/engtools/hostdata-collectors/scripts/rabbitmq.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Usage: rabbitmq.sh [-p ] [-i ] [-c ] [-h] +TOOLBIN=$(dirname $0) + +# Initialize tools environment variables, and define common utility functions +. ${TOOLBIN}/engtools_util.sh +tools_init +if [ $? -ne 0 ]; then + echo "FATAL, tools_init - could not setup environment" + exit $? +fi + +# Enable use of INTERVAL_SEC sample interval +OPT_USE_INTERVALS=1 +#Need this workaround +MQOPT="-n rabbit@localhost" +# Print key networking device statistics +function print_rabbitmq() +{ + print_separator + TOOL_HIRES_TIME + + # IMPORTANT: + # - Difficulty getting rabbitmqctl to work from init.d script; + # apparently it requires a psuedo-TTY, which is something you don't have + # until post-init. 
+ # - WORKAROUND: run command using 'sudo', even if you are 'root' + + # Dump various rabbitmq related stats + MQ_STATUS="rabbitmqctl ${MQOPT} status" + ${ECHO} "# ${MQ_STATUS}" + sudo ${MQ_STATUS} | grep -e '{memory' -A30 + ${ECHO} + + # THe following is useful in diagnosing rabbit memory leaks + # when end-users do not drain their queues (eg, due to RPC timeout issues, etc) + MQ_QUEUES="rabbitmqctl ${MQOPT} list_queues messages name pid messages_ready messages_unacknowledged memory consumers" + ${ECHO} "# ${MQ_QUEUES}" + sudo ${MQ_QUEUES} + ${ECHO} + + num_queues=$(sudo rabbitmqctl ${MQOPT} list_queues | wc -l); ((num_queues-=2)) + num_bindings=$(sudo rabbitmqctl ${MQOPT} list_bindings | wc -l); ((num_bindings-=2)) + num_exchanges=$(sudo rabbitmqctl ${MQOPT} list_exchanges | wc -l); ((num_exchanges-=2)) + num_connections=$(sudo rabbitmqctl ${MQOPT} list_connections | wc -l); ((num_connections-=2)) + num_channels=$(sudo rabbitmqctl ${MQOPT} list_channels | wc -l); ((num_channels-=2)) + arr=($(sudo rabbitmqctl ${MQOPT} list_queues messages consumers memory | \ + awk '/^[0-9]/ {a+=$1; b+=$2; c+=$3} END {print a, b, c}')) + messages=${arr[0]}; consumers=${arr[1]}; memory=${arr[2]} + printf "%6s %8s %9s %11s %8s %8s %9s %10s\n" \ + "queues" "bindings" "exchanges" "connections" "channels" "messages" "consumers" "memory" + printf "%6d %8d %9d %11d %8d %8d %9d %10d\n" \ + $num_queues $num_bindings $num_exchanges $num_connections $num_channels $messages $consumers $memory + ${ECHO} +} + +#------------------------------------------------------------------------------- +# MAIN Program: +#------------------------------------------------------------------------------- +# Parse input options +tools_parse_options "${@}" + +# Set affinity of current script +CPULIST="" +set_affinity ${CPULIST} + +LOG "collecting ${TOOLNAME} for ${PERIOD_MIN} minutes, with ${INTERVAL_SEC} second sample intervals." 
+
+# Print tools generic tools header
+tools_header
+
+# Calculate number of sample repeats based on overall interval and sampling interval
+((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC))
+
+for ((rep=1; rep <= REPEATS ; rep++))
+do
+ print_rabbitmq
+ sleep ${INTERVAL_SEC}
+done
+print_rabbitmq
+LOG "done"
+
+# normal program exit
+tools_cleanup 0
+exit 0
diff --git a/tools/engtools/hostdata-collectors/scripts/remote/rbzip2-engtools.sh b/tools/engtools/hostdata-collectors/scripts/remote/rbzip2-engtools.sh
new file mode 100644
index 000000000..3d972c477
--- /dev/null
+++ b/tools/engtools/hostdata-collectors/scripts/remote/rbzip2-engtools.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Purpose:
+# bzip2 compress engtools data on all nodes.
+
+# Define common utility functions
+TOOLBIN=$(dirname $0)
+. ${TOOLBIN}/engtools_util.sh
+if [ $UID -eq 0 ]; then
+ ERRLOG "Do not start $0 using sudo/root access."
+ exit 1
+fi
+
+# Environment (credentials) for the 'system' CLI commands
+source /etc/nova/openrc
+
+declare -a CONTROLLER
+declare -a COMPUTE
+declare -a STORAGE
+CONTROLLER=( $(system host-list | awk '(/controller/) {print $4;}') )
+COMPUTE=( $(system host-list | awk '(/compute/) {print $4;}') )
+STORAGE=( $(system host-list | awk '(/storage/) {print $4;}') )
+
+LOG "Remote bzip2 engtools data on all blades:"
+for blade in ${CONTROLLER[@]}; do
+ ping -c1 ${blade} 1>/dev/null 2>/dev/null
+ if [ $? -eq 0 ]; then
+ LOG "bzip2 on $blade:"
+ ssh -q -t -o StrictHostKeyChecking=no \
+ ${blade} sudo bzip2 /scratch/syseng_data/${blade}/*
+ else
+ WARNLOG "cannot ping: ${blade}"
+ fi
+done
+for blade in ${STORAGE[@]} ${COMPUTE[@]} ; do
+ ping -c1 ${blade} 1>/dev/null 2>/dev/null
+ if [ $? -eq 0 ]; then
+ LOG "bzip2 on $blade:"
+ ssh -q -t -o StrictHostKeyChecking=no \
+ ${blade} sudo bzip2 /tmp/syseng_data/${blade}/*
+ else
+ WARNLOG "cannot ping: ${blade}"
+ fi
+done
+LOG "done"
+
+exit 0
diff --git a/tools/engtools/hostdata-collectors/scripts/remote/rstart-engtools.sh b/tools/engtools/hostdata-collectors/scripts/remote/rstart-engtools.sh
new file mode 100644
index 000000000..f3df76d55
--- /dev/null
+++ b/tools/engtools/hostdata-collectors/scripts/remote/rstart-engtools.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Purpose:
+# Remote start engtools on all blades.
+
+# Define common utility functions
+TOOLBIN=$(dirname $0)
+. ${TOOLBIN}/engtools_util.sh
+if [ $UID -eq 0 ]; then
+ ERRLOG "Do not start $0 using sudo/root access."
+ exit 1
+fi
+
+# Environment (credentials) for the 'system' CLI commands
+source /etc/nova/openrc
+
+declare -a BLADES
+BLADES=( $(system host-list | awk '(/compute|controller|storage/) {print $4;}') )
+
+LOG "Remote start engtools on all blades:"
+for blade in ${BLADES[@]}; do
+ if [ "${blade}" == "${HOSTNAME}" ]; then
+ LOG "start on $blade:"
+ sudo service collect-engtools.sh start
+ else
+ ping -c1 ${blade} 1>/dev/null 2>/dev/null
+ if [ $? -eq 0 ]; then
+ LOG "start on $blade:"
+ ssh -q -t -o StrictHostKeyChecking=no \
+ ${blade} sudo service collect-engtools.sh start
+ else
+ WARNLOG "cannot ping: ${blade}"
+ fi
+ fi
+done
+LOG "done"
+
+exit 0
diff --git a/tools/engtools/hostdata-collectors/scripts/remote/rstop-engtools.sh b/tools/engtools/hostdata-collectors/scripts/remote/rstop-engtools.sh
new file mode 100644
index 000000000..1251ea827
--- /dev/null
+++ b/tools/engtools/hostdata-collectors/scripts/remote/rstop-engtools.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Purpose:
+# Remote stop engtools on all blades.
+
+# Define common utility functions
+TOOLBIN=$(dirname $0)
+. ${TOOLBIN}/engtools_util.sh
+if [ $UID -eq 0 ]; then
+ ERRLOG "Do not start $0 using sudo/root access."
+ exit 1
+fi
+
+# Environment (credentials) for the 'system' CLI commands
+source /etc/nova/openrc
+
+declare -a BLADES
+BLADES=( $(system host-list | awk '(/compute|controller|storage/) {print $4;}') )
+
+LOG "Remote stop engtools on all blades:"
+for blade in ${BLADES[@]}; do
+ if [ "${blade}" == "${HOSTNAME}" ]; then
+ LOG "stop on $blade:"
+ sudo service collect-engtools.sh stop
+ else
+ ping -c1 ${blade} 1>/dev/null 2>/dev/null
+ if [ $? -eq 0 ]; then
+ LOG "stop on $blade:"
+ ssh -q -t -o StrictHostKeyChecking=no \
+ ${blade} sudo service collect-engtools.sh stop
+ else
+ WARNLOG "cannot ping: ${blade}"
+ fi
+ fi
+done
+LOG "done"
+
+exit 0
diff --git a/tools/engtools/hostdata-collectors/scripts/remote/rsync-engtools-data.sh b/tools/engtools/hostdata-collectors/scripts/remote/rsync-engtools-data.sh
new file mode 100644
index 000000000..6f82f47e7
--- /dev/null
+++ b/tools/engtools/hostdata-collectors/scripts/remote/rsync-engtools-data.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+# Purpose:
+# rsync data from all nodes to backup location.
+
+# Define common utility functions
+TOOLBIN=$(dirname $0)
+. ${TOOLBIN}/engtools_util.sh
+if [ $UID -eq 0 ]; then
+ ERRLOG "Do not start $0 using sudo/root access."
+ exit 1
+fi
+
+# Environment (credentials) for the 'system' CLI commands
+source /etc/nova/openrc
+
+declare -a BLADES
+declare -a CONTROLLER
+declare -a STORAGE
+declare -a COMPUTE
+BLADES=( $(system host-list | awk '(/compute|controller|storage/) {print $4;}') )
+CONTROLLER=( $(system host-list | awk '(/controller/) {print $4;}') )
+COMPUTE=( $(system host-list | awk '(/compute/) {print $4;}') )
+STORAGE=( $(system host-list | awk '(/storage/) {print $4;}') )
+
+DEST=/opt/backups/syseng_data/
+if [[ "${HOSTNAME}" =~ "controller-" ]]; then
+ LOG "rsync DEST=${DEST}"
+else
+ LOG "*ERROR* only run this on controller"
+ exit 1
+fi
+sudo mkdir -p ${DEST}
+
+# rsync options
+USER=wrsroot
+RSYNC_OPT="-r -l --safe-links -h -P --stats --exclude=*.pyc"
+
+# Rsync data from multiple locations
+LOG "rsync engtools data from all blades:"
+
+# controllers (data lives under /scratch)
+SRC=/scratch/syseng_data/
+DEST=/opt/backups/syseng_data/
+for HOST in ${CONTROLLER[@]}
+do
+ ping -c1 ${HOST} 1>/dev/null 2>/dev/null
+ if [ $? -eq 0 ]; then
+ LOG "rsync ${RSYNC_OPT} ${USER}@${HOST}:${SRC} ${DEST}"
+ sudo rsync ${RSYNC_OPT} ${USER}@${HOST}:${SRC} ${DEST}
+ else
+ WARNLOG "cannot ping: ${HOST}"
+ fi
+done
+
+# computes & storage (data lives under /tmp)
+SRC=/tmp/syseng_data/
+DEST=/opt/backups/syseng_data/
+for HOST in ${STORAGE[@]} ${COMPUTE[@]}
+do
+ ping -c1 ${HOST} 1>/dev/null 2>/dev/null
+ if [ $? -eq 0 ]; then
+ LOG "rsync ${RSYNC_OPT} ${USER}@${HOST}:${SRC} ${DEST}"
+ sudo rsync ${RSYNC_OPT} ${USER}@${HOST}:${SRC} ${DEST}
+ else
+ WARNLOG "cannot ping: ${HOST}"
+ fi
+done
+LOG 'done'
+
+exit 0
diff --git a/tools/engtools/hostdata-collectors/scripts/slab.sh b/tools/engtools/hostdata-collectors/scripts/slab.sh
new file mode 100644
index 000000000..70e9c0596
--- /dev/null
+++ b/tools/engtools/hostdata-collectors/scripts/slab.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+PAGE_SIZE=$(getconf PAGE_SIZE)
+cat /proc/slabinfo | awk -v page_size_B=${PAGE_SIZE} '
+BEGIN {page_KiB = page_size_B/1024; TOT_KiB = 0;}
+(NF == 17) {
+ gsub(/[<>]/, "");
+ printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8s\n",
+ $2, $3, $4, $5, $6, $7, $8, $10, $11, $12, $13, $15, $16, $17, "KiB");
+}
+(NF == 16) {
+ num_objs=$3; obj_per_slab=$5; pages_per_slab=$6;
+ KiB = (obj_per_slab > 0) ? page_KiB*num_objs/obj_per_slab*pages_per_slab : 0;
+ TOT_KiB += KiB;
+ printf("%-22s %11d %8d %8d %10d %12d %1s %5d %10d %12d %1s %12d %9d %11d %8d\n",
+ $1, $2, $3, $4, $5, $6, $7, $9, $10, $11, $12, $14, $15, $16, KiB);
+}
+END {
+ printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8d\n",
+ "TOTAL", "-", "-", "-", "-", "-", ":", "-", "-", "-", ":", "-", "-", "-", TOT_KiB);
+}
+' 2>/dev/null
+
+exit 0
diff --git a/tools/engtools/hostdata-collectors/scripts/ticker.sh b/tools/engtools/hostdata-collectors/scripts/ticker.sh
new file mode 100644
index 000000000..570cd526d
--- /dev/null
+++ b/tools/engtools/hostdata-collectors/scripts/ticker.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Usage: ticker.sh [-p <period_mins>] [-i <interval_seconds>] [-c <cpulist>] [-h]
+TOOLBIN=$(dirname $0)
+
+# Initialize tools environment variables, and define common utility functions
+. ${TOOLBIN}/engtools_util.sh
+tools_init
+if [ $? -ne 0 ]; then
+ echo "FATAL, tools_init - could not setup environment"
+ exit $?
+fi
+
+# Enable use of INTERVAL_SEC sample interval
+OPT_USE_INTERVALS=1
+
+#-------------------------------------------------------------------------------
+# MAIN Program:
+#-------------------------------------------------------------------------------
+# Parse input options
+tools_parse_options "${@}"
+
+# Set affinity of current script
+CPULIST=""
+set_affinity ${CPULIST}
+
+# Calculate number of sample repeats based on overall interval and sampling interval
+((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC))
+((REP_LOG = 10 * 60 / INTERVAL_SEC))
+
+LOG_NOCR "collecting "
+t=0
+for ((rep=1; rep <= REPEATS ; rep++))
+do
+ ((t++))
+ sleep ${INTERVAL_SEC}
+ if [ ${t} -ge ${REP_LOG} ]; then
+ t=0
+ echo "."
+ LOG_NOCR "collecting "
+ else
+ echo -n "."
+ fi
+done
+echo "."
+
+LOG "done"
+
+# normal program exit
+tools_cleanup 0
+exit 0
diff --git a/tools/engtools/hostdata-collectors/scripts/top.sh b/tools/engtools/hostdata-collectors/scripts/top.sh
new file mode 100644
index 000000000..45dff33d7
--- /dev/null
+++ b/tools/engtools/hostdata-collectors/scripts/top.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Usage: top.sh [-p <period_mins>] [-i <interval_seconds>] [-c <cpulist>] [-h]
+TOOLBIN=$(dirname $0)
+
+# Initialize tools environment variables, and define common utility functions
+. ${TOOLBIN}/engtools_util.sh
+tools_init
+if [ $? -ne 0 ]; then
+ echo "FATAL, tools_init - could not setup environment"
+ exit $?
+fi
+
+# Enable use of INTERVAL_SEC sample interval
+OPT_USE_INTERVALS=1
+
+#-------------------------------------------------------------------------------
+# MAIN Program:
+#-------------------------------------------------------------------------------
+# Parse input options
+tools_parse_options "${@}"
+
+# Set affinity of current script
+CPULIST=""
+set_affinity ${CPULIST}
+
+LOG "collecting ${TOOLNAME} for ${PERIOD_MIN} minutes, with ${INTERVAL_SEC} second sample intervals."
+
+# Print tools generic tools header
+tools_header
+
+# Calculate number of sample repeats based on overall interval and sampling interval
+((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC))
+((REP = REPEATS + 1))
+
+# Execute tool for specified duration
+CMD="top -b -c -H -n ${REP} -d ${INTERVAL_SEC}"
+#LOG "CMD: ${CMD}"
+${CMD}
+LOG "done"
+
+# normal program exit
+tools_cleanup 0
+exit 0
diff --git a/tools/engtools/hostdata-collectors/scripts/vswitch.sh b/tools/engtools/hostdata-collectors/scripts/vswitch.sh
new file mode 100644
index 000000000..dae8bf76a
--- /dev/null
+++ b/tools/engtools/hostdata-collectors/scripts/vswitch.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Usage: vswitch.sh [-p <period_mins>] [-i <interval_seconds>] [-c <cpulist>] [-h]
+TOOLBIN=$(dirname $0)
+
+# Initialize tools environment variables, and define common utility functions
+. ${TOOLBIN}/engtools_util.sh
+tools_init
+if [ $? -ne 0 ]; then
+ echo "FATAL, tools_init - could not setup environment"
+ exit $?
+fi
+
+# Enable use of INTERVAL_SEC sample interval
+OPT_USE_INTERVALS=1
+
+# Print key vswitch statistics (via vshell)
+function print_vswitch()
+{
+ print_separator
+ TOOL_HIRES_TIME
+
+ cmd='vshell engine-list'
+ ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO}
+ cmd='vshell engine-stats-list'
+ ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO}
+ cmd='vshell port-list'
+ ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO}
+ cmd='vshell port-stats-list'
+ ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO}
+ cmd='vshell network-list'
+ ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO}
+ cmd='vshell network-stats-list'
+ ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO}
+ cmd='vshell interface-list'
+ ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO}
+ cmd='vshell interface-stats-list'
+ ${ECHO} "# ${cmd}" ; ${cmd} ; ${ECHO}
+}
+
+#-------------------------------------------------------------------------------
+# MAIN Program:
+#-------------------------------------------------------------------------------
+# Parse input options
+tools_parse_options "${@}"
+
+# Set affinity of current script
+CPULIST=""
+set_affinity ${CPULIST}
+
+LOG "collecting ${TOOLNAME} for ${PERIOD_MIN} minutes, with ${INTERVAL_SEC} second sample intervals."
+
+# Print tools generic tools header
+tools_header
+
+# Calculate number of sample repeats based on overall interval and sampling interval
+((REPEATS = PERIOD_MIN * 60 / INTERVAL_SEC))
+
+for ((rep=1; rep <= REPEATS ; rep++))
+do
+ print_vswitch
+ sleep ${INTERVAL_SEC}
+done
+print_vswitch
+LOG "done"
+
+# normal program exit
+tools_cleanup 0
+exit 0