Update the collectd plugins to be more robust.

* Gnocchi status plugin catches exceptions so a failed read does not trigger collectd's read-callback backoff (pattern sketched below)
* RabbitMQ plugin catches the exception raised when a queried queue does not exist

Change-Id: If49dce4a332d3f8c3048a3b80389cb2bf61c1348
akrzos 2017-07-12 10:06:54 -04:00
parent 2960b7fa1e
commit cf0327fedb
2 changed files with 77 additions and 37 deletions
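
Both fixes follow the same defensive pattern for a collectd Python read callback: do the work inside try/except, log failures through collectd, and always fall through to the interval check, so a transient API error is recorded instead of making collectd back off the callback. A minimal sketch of that pattern, with a hypothetical plugin name and a stand-in fetch function (not the exact code from either plugin):

import time

import collectd

INTERVAL = 10


def fetch_value():
    # Hypothetical stand-in for the Gnocchi/RabbitMQ API call that may raise.
    return 42.0


def read(data=None):
    starttime = time.time()
    try:
        metric = collectd.Values()
        metric.plugin = 'example_plugin'
        metric.interval = INTERVAL
        metric.type = 'gauge'
        metric.type_instance = 'example'
        metric.values = [fetch_value()]
        metric.dispatch()
    except Exception as err:
        # Log and return normally; an exception escaping a read callback is
        # what the commit message refers to as collectd backoff.
        collectd.error('example_plugin: Exception getting value: {}'.format(err))
    timediff = time.time() - starttime
    if timediff > INTERVAL:
        collectd.warning('example_plugin: Took: {} > {}'.format(
            round(timediff, 2), INTERVAL))


collectd.register_read(read, INTERVAL)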


@@ -29,37 +29,43 @@ def configure(configobj):
     collectd.info('gnocchi_status: Interval: {}'.format(INTERVAL))
     collectd.register_read(read, INTERVAL)
 
 
 def read(data=None):
     starttime = time.time()
 
     gnocchi = client.Client(session=keystone_session)
-    status = gnocchi.status.get()
+    try:
+        status = gnocchi.status.get()
 
-    metric = collectd.Values()
-    metric.plugin = 'gnocchi_status'
-    metric.interval = INTERVAL
-    metric.type = 'gauge'
-    metric.type_instance = 'measures'
-    metric.values = [status['storage']['summary']['measures']]
-    metric.dispatch()
+        metric = collectd.Values()
+        metric.plugin = 'gnocchi_status'
+        metric.interval = INTERVAL
+        metric.type = 'gauge'
+        metric.type_instance = 'measures'
+        metric.values = [status['storage']['summary']['measures']]
+        metric.dispatch()
 
-    metric = collectd.Values()
-    metric.plugin = 'gnocchi_status'
-    metric.interval = INTERVAL
-    metric.type = 'gauge'
-    metric.type_instance = 'metrics'
-    metric.values = [status['storage']['summary']['metrics']]
-    metric.dispatch()
+        metric = collectd.Values()
+        metric.plugin = 'gnocchi_status'
+        metric.interval = INTERVAL
+        metric.type = 'gauge'
+        metric.type_instance = 'metrics'
+        metric.values = [status['storage']['summary']['metrics']]
+        metric.dispatch()
+    except Exception as err:
+        collectd.error(
+            'gnocchi_status: Exception getting status: {}'
+            .format(err))
 
     timediff = time.time() - starttime
     if timediff > INTERVAL:
-        collectd.warning('gnocchi_status: Took: {} > {}'.format(round(timediff, 2),
-                                                                INTERVAL))
+        collectd.warning(
+            'gnocchi_status: Took: {} > {}'
+            .format(round(timediff, 2), INTERVAL))
 
 
 def create_keystone_session():
-    auth = v2.Password(username=os_username,
-                       password=os_password,
-                       tenant_name=os_tenant,
-                       auth_url=os_auth_url)
+    auth = v2.Password(
+        username=os_username, password=os_password, tenant_name=os_tenant,
+        auth_url=os_auth_url)
     return session.Session(auth=auth)
@@ -67,10 +73,11 @@ os_username = os.environ.get('OS_USERNAME')
 os_password = os.environ.get('OS_PASSWORD')
 os_tenant = os.environ.get('OS_TENANT_NAME')
 if os_tenant is None:
     os_tenant = os.environ.get('OS_PROJECT_NAME')
 os_auth_url = os.environ.get('OS_AUTH_URL')
-collectd.info('gnocchi_status: Connecting with user={}, password={}, tenant/project={}, '
+collectd.info(
+    'gnocchi_status: Connecting with user={}, password={}, tenant={}, '
     'auth_url={}'.format(os_username, os_password, os_tenant, os_auth_url))
 
 keystone_session = create_keystone_session()
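
The hunk above builds a Keystone session from the OS_* environment variables and hands it to the Gnocchi client. A quick standalone check of those same calls could look like the following; the keystoneauth1 and gnocchiclient import paths are assumptions, since the plugin's import block is outside this hunk:

import os

from gnocchiclient.v1 import client    # assumed import path
from keystoneauth1 import session      # assumed import path
from keystoneauth1.identity import v2  # assumed import path

auth = v2.Password(
    username=os.environ.get('OS_USERNAME'),
    password=os.environ.get('OS_PASSWORD'),
    tenant_name=os.environ.get('OS_TENANT_NAME') or os.environ.get('OS_PROJECT_NAME'),
    auth_url=os.environ.get('OS_AUTH_URL'))
gnocchi = client.Client(session=session.Session(auth=auth))

# The same keys the plugin dispatches; a failure anywhere in here is what the
# new try/except in read() now logs instead of raising.
status = gnocchi.status.get()
print(status['storage']['summary']['measures'],
      status['storage']['summary']['metrics'])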


@@ -10,12 +10,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Collectd python plugin to read rabbitmq metrics from rabbitmq management plugin."""
+"""Collectd python plugin to read rabbitmq metrics from rabbitmq management
+plugin.
+"""
 from pyrabbit.api import Client
+from pyrabbit.http import HTTPError
 import collectd
 import os
 import time
 
 
 def configure(configobj):
     global INTERVAL
     global cl
@@ -27,20 +31,28 @@ def configure(configobj):
     port = int(config['port'][0])
     username = config['username'][0]
     password = config['password'][0]
-    queues_to_count = config['message_count']
+    queues_to_count = []
+    if 'message_count' in config:
+        queues_to_count = config['message_count']
     collectd.info('rabbitmq_monitoring: Interval: {}'.format(INTERVAL))
     cl = Client('{}:{}'.format(host, port), username, password)
-    collectd.info('rabbitmq_monitoring: Connecting to: {}:{} as user:{} password:{}'.format(host, port, username, password))
-    collectd.info('rabbitmq_monitoring: Counting messages on: {}'.format(queues_to_count))
+    collectd.info(
+        'rabbitmq_monitoring: Connecting to: {}:{} as user:{} password:{}'
+        .format(host, port, username, password))
+    collectd.info(
+        'rabbitmq_monitoring: Counting messages on: {}'
+        .format(queues_to_count))
     collectd.register_read(read, INTERVAL)
 
 
 def read(data=None):
     starttime = time.time()
     overview = cl.get_overview()
 
     # Object counts
-    for metric_instance in ['channels', 'connections', 'consumers', 'exchanges', 'queues']:
+    for metric_instance in \
+            ['channels', 'connections', 'consumers', 'exchanges', 'queues']:
         metric = collectd.Values()
         metric.plugin = 'rabbitmq_monitoring'
         metric.interval = INTERVAL
@@ -50,7 +62,8 @@ def read(data=None):
         metric.dispatch()
 
     # Aggregated Queue message stats
-    for metric_instance in ['messages', 'messages_ready', 'messages_unacknowledged']:
+    for metric_instance in \
+            ['messages', 'messages_ready', 'messages_unacknowledged']:
         metric = collectd.Values()
         metric.plugin = 'rabbitmq_monitoring'
         metric.interval = INTERVAL
@@ -64,13 +77,20 @@ def read(data=None):
         metric.interval = INTERVAL
         metric.type = 'gauge'
         metric.type_instance = 'queue_total-{}-rate'.format(metric_instance)
-        metric.values = [overview['queue_totals']['{}_details'.format(metric_instance)]['rate']]
+        metric.values = \
+            [
+                overview['queue_totals']['{}_details'.format(metric_instance)]
+                ['rate']
+            ]
         metric.dispatch()
 
     # Aggregated Message Stats
-    for metric_instance in ['ack', 'confirm', 'deliver', 'deliver_get', 'deliver_no_ack', 'get',
-                            'get_no_ack', 'publish', 'publish_in', 'publish_out', 'redeliver',
-                            'return_unroutable']:
+    for metric_instance in \
+        [
+            'ack', 'confirm', 'deliver', 'deliver_get', 'deliver_no_ack',
+            'get', 'get_no_ack', 'publish', 'publish_in', 'publish_out',
+            'redeliver', 'return_unroutable'
+        ]:
         metric = collectd.Values()
         metric.plugin = 'rabbitmq_monitoring'
         metric.interval = INTERVAL
@@ -84,12 +104,22 @@ def read(data=None):
         metric.interval = INTERVAL
         metric.type = 'gauge'
         metric.type_instance = 'message_total-{}-rate'.format(metric_instance)
-        metric.values = [overview['message_stats']['{}_details'.format(metric_instance)]['rate']]
+        metric.values = \
+            [
+                overview['message_stats']['{}_details'.format(metric_instance)]
+                ['rate']
+            ]
         metric.dispatch()
 
     # Configurable per-queue message counts
     for queue_name in queues_to_count:
-        messages_detail = cl.get_messages('/', queue_name)
+        messages_detail = None
+        try:
+            messages_detail = cl.get_messages('/', queue_name)
+        except HTTPError as err:
+            collectd.error(
+                'Error Opening Queue [{}] details: {}'
+                .format(queue_name, err))
         if messages_detail is None:
             count = 0
         else:
@@ -104,7 +134,10 @@
     timediff = time.time() - starttime
     if timediff > INTERVAL:
-        collectd.warning('rabbitmq_monitoring: Took: {} > {}'.format(round(timediff, 2),
-                                                                     INTERVAL))
+        collectd.warning(
+            'rabbitmq_monitoring: Took: {} > {}'.format(
+                round(timediff, 2),
+                INTERVAL)
+        )
 
 
 collectd.register_config(configure)
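
For reference, the per-queue guard added above can be exercised outside collectd with the same pyrabbit calls; the host, credentials, and queue names below are placeholders:

from pyrabbit.api import Client
from pyrabbit.http import HTTPError

cl = Client('localhost:15672', 'guest', 'guest')  # placeholder endpoint and credentials
for queue_name in ['notifications.info', 'no_such_queue']:
    try:
        messages_detail = cl.get_messages('/', queue_name)
    except HTTPError as err:
        # Asking the management API about a queue that does not exist raises
        # HTTPError; the plugin now logs this and reports a count of 0.
        print('Error Opening Queue [{}] details: {}'.format(queue_name, err))
        messages_detail = None
    print(queue_name, messages_detail)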