From cf0327fedb230af85ad24f9e598394552356af19 Mon Sep 17 00:00:00 2001 From: akrzos Date: Wed, 12 Jul 2017 10:06:54 -0400 Subject: [PATCH] Update the collectd plugins to be more robust. * Gnocchi status catches exceptions to avoid collectd backoff * Rabbitmq plugin catches exceptions when a queue does not exist Change-Id: If49dce4a332d3f8c3048a3b80389cb2bf61c1348 --- .../files/collectd_gnocchi_status.py | 53 +++++++++------- .../files/collectd_rabbitmq_monitoring.py | 61 ++++++++++++++----- 2 files changed, 77 insertions(+), 37 deletions(-) diff --git a/ansible/install/roles/collectd-openstack/files/collectd_gnocchi_status.py b/ansible/install/roles/collectd-openstack/files/collectd_gnocchi_status.py index e282c634f..d37d3e885 100644 --- a/ansible/install/roles/collectd-openstack/files/collectd_gnocchi_status.py +++ b/ansible/install/roles/collectd-openstack/files/collectd_gnocchi_status.py @@ -29,37 +29,43 @@ def configure(configobj): collectd.info('gnocchi_status: Interval: {}'.format(INTERVAL)) collectd.register_read(read, INTERVAL) + def read(data=None): starttime = time.time() gnocchi = client.Client(session=keystone_session) - status = gnocchi.status.get() + try: + status = gnocchi.status.get() + metric = collectd.Values() + metric.plugin = 'gnocchi_status' + metric.interval = INTERVAL + metric.type = 'gauge' + metric.type_instance = 'measures' + metric.values = [status['storage']['summary']['measures']] + metric.dispatch() - metric = collectd.Values() - metric.plugin = 'gnocchi_status' - metric.interval = INTERVAL - metric.type = 'gauge' - metric.type_instance = 'measures' - metric.values = [status['storage']['summary']['measures']] - metric.dispatch() - - metric = collectd.Values() - metric.plugin = 'gnocchi_status' - metric.interval = INTERVAL - metric.type = 'gauge' - metric.type_instance = 'metrics' - metric.values = [status['storage']['summary']['metrics']] - metric.dispatch() + metric = collectd.Values() + metric.plugin = 'gnocchi_status' + metric.interval = INTERVAL + metric.type = 'gauge' + metric.type_instance = 'metrics' + metric.values = [status['storage']['summary']['metrics']] + metric.dispatch() + except Exception as err: + collectd.error( + 'gnocchi_status: Exception getting status: {}' + .format(err)) timediff = time.time() - starttime if timediff > INTERVAL: - collectd.warning('gnocchi_status: Took: {} > {}'.format(round(timediff, 2), - INTERVAL)) + collectd.warning( + 'gnocchi_status: Took: {} > {}' + .format(round(timediff, 2), INTERVAL)) + def create_keystone_session(): - auth = v2.Password(username=os_username, - password=os_password, - tenant_name=os_tenant, + auth = v2.Password( + username=os_username, password=os_password, tenant_name=os_tenant, auth_url=os_auth_url) return session.Session(auth=auth) @@ -67,10 +73,11 @@ os_username = os.environ.get('OS_USERNAME') os_password = os.environ.get('OS_PASSWORD') os_tenant = os.environ.get('OS_TENANT_NAME') if os_tenant is None: - os_tenant = os.environ.get('OS_PROJECT_NAME') + os_tenant = os.environ.get('OS_PROJECT_NAME') os_auth_url = os.environ.get('OS_AUTH_URL') -collectd.info('gnocchi_status: Connecting with user={}, password={}, tenant/project={}, ' +collectd.info( + 'gnocchi_status: Connecting with user={}, password={}, tenant={}, ' 'auth_url={}'.format(os_username, os_password, os_tenant, os_auth_url)) keystone_session = create_keystone_session() diff --git a/ansible/install/roles/collectd-openstack/files/collectd_rabbitmq_monitoring.py b/ansible/install/roles/collectd-openstack/files/collectd_rabbitmq_monitoring.py index 1465dd474..010c1d5c5 100644 --- a/ansible/install/roles/collectd-openstack/files/collectd_rabbitmq_monitoring.py +++ b/ansible/install/roles/collectd-openstack/files/collectd_rabbitmq_monitoring.py @@ -10,12 +10,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Collectd python plugin to read rabbitmq metrics from rabbitmq management plugin.""" +"""Collectd python plugin to read rabbitmq metrics from rabbitmq management +plugin. +""" from pyrabbit.api import Client +from pyrabbit.http import HTTPError import collectd import os import time + def configure(configobj): global INTERVAL global cl @@ -27,20 +31,28 @@ def configure(configobj): port = int(config['port'][0]) username = config['username'][0] password = config['password'][0] - queues_to_count = config['message_count'] + queues_to_count = [] + if 'message_count' in config: + queues_to_count = config['message_count'] collectd.info('rabbitmq_monitoring: Interval: {}'.format(INTERVAL)) cl = Client('{}:{}'.format(host, port), username, password) - collectd.info('rabbitmq_monitoring: Connecting to: {}:{} as user:{} password:{}'.format(host, port, username, password)) - collectd.info('rabbitmq_monitoring: Counting messages on: {}'.format(queues_to_count)) + collectd.info( + 'rabbitmq_monitoring: Connecting to: {}:{} as user:{} password:{}' + .format(host, port, username, password)) + collectd.info( + 'rabbitmq_monitoring: Counting messages on: {}' + .format(queues_to_count)) collectd.register_read(read, INTERVAL) + def read(data=None): starttime = time.time() overview = cl.get_overview() # Object counts - for metric_instance in ['channels', 'connections', 'consumers', 'exchanges', 'queues']: + for metric_instance in \ + ['channels', 'connections', 'consumers', 'exchanges', 'queues']: metric = collectd.Values() metric.plugin = 'rabbitmq_monitoring' metric.interval = INTERVAL @@ -50,7 +62,8 @@ def read(data=None): metric.dispatch() # Aggregated Queue message stats - for metric_instance in ['messages', 'messages_ready', 'messages_unacknowledged']: + for metric_instance in \ + ['messages', 'messages_ready', 'messages_unacknowledged']: metric = collectd.Values() metric.plugin = 'rabbitmq_monitoring' metric.interval = INTERVAL @@ -64,13 +77,20 @@ def read(data=None): metric.interval = INTERVAL metric.type = 'gauge' metric.type_instance = 'queue_total-{}-rate'.format(metric_instance) - metric.values = [overview['queue_totals']['{}_details'.format(metric_instance)]['rate']] + metric.values = \ + [ + overview['queue_totals']['{}_details'.format(metric_instance)] + ['rate'] + ] metric.dispatch() # Aggregated Message Stats - for metric_instance in ['ack', 'confirm', 'deliver', 'deliver_get', 'deliver_no_ack', 'get', - 'get_no_ack', 'publish', 'publish_in', 'publish_out', 'redeliver', - 'return_unroutable']: + for metric_instance in \ + [ + 'ack', 'confirm', 'deliver', 'deliver_get', 'deliver_no_ack', + 'get', 'get_no_ack', 'publish', 'publish_in', 'publish_out', + 'redeliver', 'return_unroutable' + ]: metric = collectd.Values() metric.plugin = 'rabbitmq_monitoring' metric.interval = INTERVAL @@ -84,12 +104,22 @@ def read(data=None): metric.interval = INTERVAL metric.type = 'gauge' metric.type_instance = 'message_total-{}-rate'.format(metric_instance) - metric.values = [overview['message_stats']['{}_details'.format(metric_instance)]['rate']] + metric.values = \ + [ + overview['message_stats']['{}_details'.format(metric_instance)] + ['rate'] + ] metric.dispatch() # Configurable per-queue message counts for queue_name in queues_to_count: - messages_detail = cl.get_messages('/', queue_name) + messages_detail = None + try: + messages_detail = cl.get_messages('/', queue_name) + except HTTPError as err: + collectd.error( + 'Error Opening Queue [{}] details: {}' + .format(queue_name, err)) if messages_detail is None: count = 0 else: @@ -104,7 +134,10 @@ def read(data=None): timediff = time.time() - starttime if timediff > INTERVAL: - collectd.warning('rabbitmq_monitoring: Took: {} > {}'.format(round(timediff, 2), - INTERVAL)) + collectd.warning( + 'rabbitmq_monitoring: Took: {} > {}'.format( + round(timediff, 2), + INTERVAL) + ) collectd.register_config(configure)