Update the collectd plugins to be more robust.

* Gnocchi status catches exceptions to avoid collectd backoff
* Rabbitmq plugin catches exceptions when a queue does not exist

Change-Id: If49dce4a332d3f8c3048a3b80389cb2bf61c1348
Author: akrzos  2017-07-12 10:06:54 -04:00
parent 2960b7fa1e
commit cf0327fedb
2 changed files with 77 additions and 37 deletions
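
For context, both changes follow the same pattern: a collectd read callback must not
let an exception escape, because collectd treats a raised exception as a failed read
and backs off (delays) subsequent calls to that callback. The plugins now catch the
error, log it, and return, so the next interval is attempted on schedule; the RabbitMQ
plugin additionally catches pyrabbit's HTTPError per queue so one missing queue cannot
break the whole read. Below is a minimal sketch of that shape, with a hypothetical
plugin name and a placeholder fetch_value() helper; it only runs inside collectd's
python plugin, which provides the collectd module.

# Minimal, exception-safe collectd read callback (sketch, not the plugins in this commit).
import collectd

INTERVAL = 10


def fetch_value():
    # Placeholder for any remote call that can raise
    # (Gnocchi status API, RabbitMQ management API, ...).
    raise RuntimeError('backend unreachable')


def read(data=None):
    try:
        value = fetch_value()
    except Exception as err:
        # Log and return normally: raising here would make collectd mark the
        # read as failed and back off future reads of this plugin.
        collectd.error('example_plugin: error fetching value: {}'.format(err))
        return
    metric = collectd.Values()
    metric.plugin = 'example_plugin'
    metric.type = 'gauge'
    metric.interval = INTERVAL
    metric.values = [value]
    metric.dispatch()


collectd.register_read(read, INTERVAL)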

View File

@@ -29,12 +29,13 @@ def configure(configobj):
     collectd.info('gnocchi_status: Interval: {}'.format(INTERVAL))
     collectd.register_read(read, INTERVAL)
 
 
 def read(data=None):
     starttime = time.time()
     gnocchi = client.Client(session=keystone_session)
+    try:
         status = gnocchi.status.get()
 
         metric = collectd.Values()
         metric.plugin = 'gnocchi_status'
         metric.interval = INTERVAL
@@ -50,16 +51,21 @@ def read(data=None):
         metric.type_instance = 'metrics'
         metric.values = [status['storage']['summary']['metrics']]
         metric.dispatch()
+    except Exception as err:
+        collectd.error(
+            'gnocchi_status: Exception getting status: {}'
+            .format(err))
 
     timediff = time.time() - starttime
     if timediff > INTERVAL:
-        collectd.warning('gnocchi_status: Took: {} > {}'.format(round(timediff, 2),
-                                                                INTERVAL))
+        collectd.warning(
+            'gnocchi_status: Took: {} > {}'
+            .format(round(timediff, 2), INTERVAL))
 
 
 def create_keystone_session():
-    auth = v2.Password(username=os_username,
-                       password=os_password,
-                       tenant_name=os_tenant,
-                       auth_url=os_auth_url)
+    auth = v2.Password(
+        username=os_username, password=os_password, tenant_name=os_tenant,
+        auth_url=os_auth_url)
     return session.Session(auth=auth)
@@ -70,7 +76,8 @@ if os_tenant is None:
     os_tenant = os.environ.get('OS_PROJECT_NAME')
 os_auth_url = os.environ.get('OS_AUTH_URL')
 
-collectd.info('gnocchi_status: Connecting with user={}, password={}, tenant/project={}, '
+collectd.info(
+    'gnocchi_status: Connecting with user={}, password={}, tenant={}, '
     'auth_url={}'.format(os_username, os_password, os_tenant, os_auth_url))
 
 keystone_session = create_keystone_session()

View File

@@ -10,12 +10,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Collectd python plugin to read rabbitmq metrics from rabbitmq management plugin."""
+"""Collectd python plugin to read rabbitmq metrics from rabbitmq management
+plugin.
+"""
 from pyrabbit.api import Client
+from pyrabbit.http import HTTPError
 
 import collectd
 import os
 import time
 
 
 def configure(configobj):
     global INTERVAL
     global cl
@@ -27,20 +31,28 @@ def configure(configobj):
     port = int(config['port'][0])
     username = config['username'][0]
     password = config['password'][0]
+    queues_to_count = []
+    if 'message_count' in config:
         queues_to_count = config['message_count']
     collectd.info('rabbitmq_monitoring: Interval: {}'.format(INTERVAL))
     cl = Client('{}:{}'.format(host, port), username, password)
-    collectd.info('rabbitmq_monitoring: Connecting to: {}:{} as user:{} password:{}'.format(host, port, username, password))
-    collectd.info('rabbitmq_monitoring: Counting messages on: {}'.format(queues_to_count))
+    collectd.info(
+        'rabbitmq_monitoring: Connecting to: {}:{} as user:{} password:{}'
+        .format(host, port, username, password))
+    collectd.info(
+        'rabbitmq_monitoring: Counting messages on: {}'
+        .format(queues_to_count))
     collectd.register_read(read, INTERVAL)
 
 
 def read(data=None):
     starttime = time.time()
     overview = cl.get_overview()
 
     # Object counts
-    for metric_instance in ['channels', 'connections', 'consumers', 'exchanges', 'queues']:
+    for metric_instance in \
+            ['channels', 'connections', 'consumers', 'exchanges', 'queues']:
         metric = collectd.Values()
         metric.plugin = 'rabbitmq_monitoring'
         metric.interval = INTERVAL
@@ -50,7 +62,8 @@ def read(data=None):
         metric.dispatch()
 
     # Aggregated Queue message stats
-    for metric_instance in ['messages', 'messages_ready', 'messages_unacknowledged']:
+    for metric_instance in \
+            ['messages', 'messages_ready', 'messages_unacknowledged']:
         metric = collectd.Values()
         metric.plugin = 'rabbitmq_monitoring'
         metric.interval = INTERVAL
@@ -64,13 +77,20 @@ def read(data=None):
         metric.interval = INTERVAL
         metric.type = 'gauge'
         metric.type_instance = 'queue_total-{}-rate'.format(metric_instance)
-        metric.values = [overview['queue_totals']['{}_details'.format(metric_instance)]['rate']]
+        metric.values = \
+            [
+                overview['queue_totals']['{}_details'.format(metric_instance)]
+                ['rate']
+            ]
         metric.dispatch()
 
     # Aggregated Message Stats
-    for metric_instance in ['ack', 'confirm', 'deliver', 'deliver_get', 'deliver_no_ack', 'get',
-                            'get_no_ack', 'publish', 'publish_in', 'publish_out', 'redeliver',
-                            'return_unroutable']:
+    for metric_instance in \
+        [
+            'ack', 'confirm', 'deliver', 'deliver_get', 'deliver_no_ack',
+            'get', 'get_no_ack', 'publish', 'publish_in', 'publish_out',
+            'redeliver', 'return_unroutable'
+        ]:
         metric = collectd.Values()
         metric.plugin = 'rabbitmq_monitoring'
         metric.interval = INTERVAL
@@ -84,12 +104,22 @@ def read(data=None):
         metric.interval = INTERVAL
         metric.type = 'gauge'
         metric.type_instance = 'message_total-{}-rate'.format(metric_instance)
-        metric.values = [overview['message_stats']['{}_details'.format(metric_instance)]['rate']]
+        metric.values = \
+            [
+                overview['message_stats']['{}_details'.format(metric_instance)]
+                ['rate']
+            ]
         metric.dispatch()
 
     # Configurable per-queue message counts
     for queue_name in queues_to_count:
+        messages_detail = None
+        try:
             messages_detail = cl.get_messages('/', queue_name)
+        except HTTPError as err:
+            collectd.error(
+                'Error Opening Queue [{}] details: {}'
+                .format(queue_name, err))
         if messages_detail is None:
             count = 0
         else:
@@ -104,7 +134,10 @@ def read(data=None):
     timediff = time.time() - starttime
     if timediff > INTERVAL:
-        collectd.warning('rabbitmq_monitoring: Took: {} > {}'.format(round(timediff, 2),
-                                                                     INTERVAL))
+        collectd.warning(
+            'rabbitmq_monitoring: Took: {} > {}'.format(
+                round(timediff, 2),
+                INTERVAL)
+            )
 
 
 collectd.register_config(configure)