From 135612f0f9194441ae27b78413eae22e8295d368 Mon Sep 17 00:00:00 2001
From: Eoghan Glynn
Date: Tue, 25 Jun 2013 22:27:48 +0100
Subject: [PATCH] Basic alarm threshold evaluation logic.

Partially addresses BP alarm-distributed-threshold-evaluation.

Threshold evaluation logic encapsulating basic alarm statistics
querying, threshold comparison, and state transition rules.

Change-Id: I0f3a50809985d25ab0eceb990b142da8701a9616
---
 ceilometer/alarm/__init__.py             |   0
 ceilometer/alarm/threshold_evaluation.py | 219 +++++++++++++++++++++++
 requirements.txt                         |   1 +
 tests/alarm/__init__.py                  |   0
 tests/alarm/test_threshold_evaluation.py | 212 ++++++++++++++++++++++
 5 files changed, 432 insertions(+)
 create mode 100644 ceilometer/alarm/__init__.py
 create mode 100644 ceilometer/alarm/threshold_evaluation.py
 create mode 100644 tests/alarm/__init__.py
 create mode 100644 tests/alarm/test_threshold_evaluation.py

diff --git a/ceilometer/alarm/__init__.py b/ceilometer/alarm/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/ceilometer/alarm/threshold_evaluation.py b/ceilometer/alarm/threshold_evaluation.py
new file mode 100644
index 000000000..38930b4fe
--- /dev/null
+++ b/ceilometer/alarm/threshold_evaluation.py
@@ -0,0 +1,219 @@
+# -*- encoding: utf-8 -*-
+#
+# Copyright © 2013 Red Hat, Inc
+#
+# Author: Eoghan Glynn
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import datetime
+import operator
+
+from oslo.config import cfg
+
+from ceilometer.openstack.common import log
+from ceilometerclient import client as ceiloclient
+
+LOG = log.getLogger(__name__)
+
+COMPARATORS = {
+    'gt': operator.gt,
+    'lt': operator.lt,
+    'ge': operator.ge,
+    'le': operator.le,
+    'eq': operator.eq,
+    'ne': operator.ne,
+}
+
+UNKNOWN = 'insufficient data'
+OK = 'ok'
+ALARM = 'alarm'
+
+
+class Evaluator(object):
+    """This class implements the basic alarm threshold evaluation
+    logic.
+    """
+
+    # the sliding evaluation window is extended to allow
+    # for reporting/ingestion lag
+    look_back = 1
+
+    # minimum number of datapoints within sliding window to
+    # avoid unknown state
+    quorum = 1
+
+    def __init__(self, notifier):
+        self.alarms = []
+        self.notifier = notifier
+        self.api_client = None
+
+    def assign_alarms(self, alarms):
+        """Assign alarms to be evaluated."""
+        self.alarms = alarms
+
+    @property
+    def _client(self):
+        """Construct or reuse an authenticated API client."""
+        if not self.api_client:
+            auth_config = cfg.CONF.service_credentials
+            creds = dict(
+                os_auth_url=auth_config.os_auth_url,
+                os_tenant_name=auth_config.os_tenant_name,
+                os_password=auth_config.os_password,
+                os_username=auth_config.os_username
+            )
+            self.api_client = ceiloclient.get_client(2, **creds)
+        return self.api_client
+
+    @staticmethod
+    def _constraints(alarm):
+        """Assert the constraints on the statistics query."""
+        constraints = []
+        for (field, value) in alarm.matching_metadata.iteritems():
+            constraints.append(dict(field=field, op='eq', value=value))
+        return constraints
+
+    @classmethod
+    def _bound_duration(cls, alarm, constraints):
+        """Bound the duration of the statistics query."""
+        now = datetime.datetime.utcnow()
+        window = (alarm.period *
+                  (alarm.evaluation_periods + cls.look_back))
+        start = now - datetime.timedelta(seconds=window)
+        LOG.debug(_('query stats from %(start)s to %(now)s') % locals())
+        after = dict(field='timestamp', op='ge', value=start.isoformat())
+        before = dict(field='timestamp', op='le', value=now.isoformat())
+        constraints.extend([before, after])
+        return constraints
+
+    @staticmethod
+    def _sanitize(alarm, statistics):
+        """Sanitize statistics.
+        Ultimately this will be the hook for the exclusion of chaotic
+        datapoints for example.
+        """
+        LOG.debug(_('sanitize stats %s') % statistics)
+        # in practice statistics are always sorted by period start, not
+        # strictly required by the API though
+        statistics = statistics[:alarm.evaluation_periods]
+        LOG.debug(_('pruned statistics to %d') % len(statistics))
+        return statistics
+
+    def _statistics(self, alarm, query):
+        """Retrieve statistics over the current window."""
+        LOG.debug(_('stats query %s') % query)
+        try:
+            return self._client.statistics.list(alarm.counter_name,
+                                                q=query,
+                                                period=alarm.period)
+        except Exception:
+            LOG.exception(_('alarm stats retrieval failed'))
+            return []
+
+    def _update(self, alarm, state, reason):
+        """Refresh alarm state."""
+        id = alarm.alarm_id
+        LOG.info(_('alarm %(id)s transitioning to %(state)s'
+                   ' because %(reason)s') % locals())
+        try:
+            self._client.alarms.update(id, **dict(state=state))
+            alarm.state = state
+            if self.notifier:
+                self.notifier.notify(alarm, state, reason)
+        except Exception:
+            # retry will occur naturally on the next evaluation
+            # cycle (unless alarm state reverts in the meantime)
+            LOG.exception(_('alarm state update failed'))
+
+    def _sufficient(self, alarm, statistics):
+        """Ensure there is sufficient data for evaluation,
+        transitioning to unknown otherwise.
+        """
+        sufficient = len(statistics) >= self.quorum
+        if not sufficient and alarm.state != UNKNOWN:
+            reason = _('%d datapoints are unknown') % alarm.evaluation_periods
+            self._update(alarm, UNKNOWN, reason)
+        return sufficient
+
+    @staticmethod
+    def _reason(alarm, statistics, distilled, state):
+        """Fabricate reason string."""
+        count = len(statistics)
+        disposition = 'inside' if state == OK else 'outside'
+        last = getattr(statistics[-1], alarm.statistic)
+        return (_('Transition to %(state)s due to %(count)d samples'
+                  ' %(disposition)s threshold, most recent: %(last)s') %
+                locals())
+
+    def _transition(self, alarm, statistics, compared):
+        """Transition alarm state if necessary.
+
+        The transition rules are currently hardcoded as:
+
+        - transitioning from a known state requires an unequivocal
+          set of datapoints
+
+        - transitioning from unknown is on the basis of the most
+          recent datapoint if equivocal
+
+        Ultimately this will be policy-driven.
+        """
+        distilled = all(compared)
+        unequivocal = distilled or not any(compared)
+        if unequivocal:
+            state = ALARM if distilled else OK
+            if alarm.state != state:
+                reason = self._reason(alarm, statistics, distilled, state)
+                self._update(alarm, state, reason)
+        elif alarm.state == UNKNOWN:
+            state = ALARM if compared[-1] else OK
+            reason = self._reason(alarm, statistics, distilled, state)
+            self._update(alarm, state, reason)
+
+    def evaluate(self):
+        """Evaluate the alarms assigned to this evaluator."""
+
+        LOG.info(_('initiating evaluation cycle on %d alarms') %
+                 len(self.alarms))
+
+        for alarm in self.alarms:
+
+            if not alarm.enabled:
+                LOG.debug(_('skipping alarm %s') % alarm.alarm_id)
+                continue
+            LOG.debug(_('evaluating alarm %s') % alarm.alarm_id)
+
+            query = self._bound_duration(
+                alarm,
+                self._constraints(alarm)
+            )
+
+            statistics = self._sanitize(
+                alarm,
+                self._statistics(alarm, query)
+            )
+
+            if self._sufficient(alarm, statistics):
+
+                def _compare(stat):
+                    op = COMPARATORS[alarm.comparison_operator]
+                    value = getattr(stat, alarm.statistic)
+                    limit = alarm.threshold
+                    LOG.debug(_('comparing value %(value)s against threshold'
+                                ' %(limit)s') % locals())
+                    return op(value, limit)
+
+                self._transition(alarm,
+                                 statistics,
+                                 list(map(_compare, statistics)))
diff --git a/requirements.txt b/requirements.txt
index 973853036..e94746de2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,6 +16,7 @@ msgpack-python
 python-glanceclient
 python-novaclient>=2.6.10
 python-keystoneclient>=0.2,<0.3
+python-ceilometerclient>=1.0.1
 python-swiftclient
 lxml
 requests>=1.1,<1.2.1
diff --git a/tests/alarm/__init__.py b/tests/alarm/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/alarm/test_threshold_evaluation.py b/tests/alarm/test_threshold_evaluation.py
new file mode 100644
index 000000000..7fc77bcd5
--- /dev/null
+++ b/tests/alarm/test_threshold_evaluation.py
@@ -0,0 +1,212 @@
+# -*- encoding: utf-8 -*-
+#
+# Copyright © 2013 Red Hat, Inc
+#
+# Author: Eoghan Glynn
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+"""Tests for ceilometer/alarm/threshold_evaluation.py +""" +import mock +import uuid + +from ceilometer.alarm import threshold_evaluation +from ceilometer.storage import models +from ceilometer.tests import base +from ceilometerclient import exc +from ceilometerclient.v2 import statistics + + +class TestEvaluate(base.TestCase): + def setUp(self): + super(TestEvaluate, self).setUp() + self.api_client = mock.Mock() + self.notifier = mock.MagicMock() + self.alarms = [ + models.Alarm(name='instance_running_hot', + counter_name='cpu_util', + comparison_operator='gt', + threshold=80.0, + evaluation_periods=5, + statistic='avg', + user_id='foobar', + project_id='snafu', + period=60, + alarm_id=str(uuid.uuid4()), + matching_metadata={'resource_id': + 'my_instance'}), + models.Alarm(name='group_running_idle', + counter_name='cpu_util', + comparison_operator='le', + threshold=10.0, + statistic='max', + evaluation_periods=4, + user_id='foobar', + project_id='snafu', + period=300, + alarm_id=str(uuid.uuid4()), + matching_metadata={'metadata.user_metadata.AS': + 'my_group'}), + ] + self.evaluator = threshold_evaluation.Evaluator(self.notifier) + self.evaluator.assign_alarms(self.alarms) + + @staticmethod + def _get_stat(attr, value): + return statistics.Statistics(None, {attr: value}) + + def _set_all_alarms(self, state): + for alarm in self.alarms: + alarm.state = state + + def _assert_all_alarms(self, state): + for alarm in self.alarms: + self.assertEqual(alarm.state, state) + + def test_retry_transient_api_failure(self): + with mock.patch('ceilometerclient.client.get_client', + return_value=self.api_client): + broken = exc.CommunicationError(message='broken') + avgs = [self._get_stat('avg', self.alarms[0].threshold - v) + for v in xrange(5)] + maxs = [self._get_stat('max', self.alarms[1].threshold + v) + for v in xrange(1, 4)] + self.api_client.statistics.list.side_effect = [broken, + broken, + avgs, + maxs] + self.evaluator.evaluate() + self._assert_all_alarms('insufficient data') + self.evaluator.evaluate() + self._assert_all_alarms('ok') + + def test_simple_insufficient(self): + self._set_all_alarms('ok') + with mock.patch('ceilometerclient.client.get_client', + return_value=self.api_client): + self.api_client.statistics.list.return_value = [] + self.evaluator.evaluate() + self._assert_all_alarms('insufficient data') + expected = [mock.call(alarm.alarm_id, state='insufficient data') + for alarm in self.alarms] + update_calls = self.api_client.alarms.update.call_args_list + self.assertEqual(update_calls, expected) + expected = [mock.call(alarm, + 'insufficient data', + ('%d datapoints are unknown' % + alarm.evaluation_periods)) + for alarm in self.alarms] + self.assertEqual(self.notifier.notify.call_args_list, expected) + + def test_disabled_is_skipped(self): + self._set_all_alarms('ok') + self.alarms[1].enabled = False + with mock.patch('ceilometerclient.client.get_client', + return_value=self.api_client): + self.api_client.statistics.list.return_value = [] + self.evaluator.evaluate() + self.assertEqual(self.alarms[0].state, 'insufficient data') + self.assertEqual(self.alarms[1].state, 'ok') + self.api_client.alarms.update.assert_called_once_with( + self.alarms[0].alarm_id, + state='insufficient data' + ) + self.notifier.notify.assert_called_once_with( + self.alarms[0], + 'insufficient data', + mock.ANY + ) + + def test_simple_alarm_trip(self): + self._set_all_alarms('ok') + with mock.patch('ceilometerclient.client.get_client', + return_value=self.api_client): + avgs = [self._get_stat('avg', 
+                    for v in xrange(1, 6)]
+            maxs = [self._get_stat('max', self.alarms[1].threshold - v)
+                    for v in xrange(4)]
+            self.api_client.statistics.list.side_effect = [avgs, maxs]
+            self.evaluator.evaluate()
+            self._assert_all_alarms('alarm')
+            expected = [mock.call(alarm.alarm_id, state='alarm')
+                        for alarm in self.alarms]
+            update_calls = self.api_client.alarms.update.call_args_list
+            self.assertEqual(update_calls, expected)
+            reasons = ['Transition to alarm due to 5 samples outside'
+                       ' threshold, most recent: 85.0',
+                       'Transition to alarm due to 4 samples outside'
+                       ' threshold, most recent: 7.0']
+            expected = [mock.call(alarm, 'alarm', reason)
+                        for alarm, reason in zip(self.alarms, reasons)]
+            self.assertEqual(self.notifier.notify.call_args_list, expected)
+
+    def test_simple_alarm_clear(self):
+        self._set_all_alarms('alarm')
+        with mock.patch('ceilometerclient.client.get_client',
+                        return_value=self.api_client):
+            avgs = [self._get_stat('avg', self.alarms[0].threshold - v)
+                    for v in xrange(5)]
+            maxs = [self._get_stat('max', self.alarms[1].threshold + v)
+                    for v in xrange(1, 5)]
+            self.api_client.statistics.list.side_effect = [avgs, maxs]
+            self.evaluator.evaluate()
+            self._assert_all_alarms('ok')
+            expected = [mock.call(alarm.alarm_id, state='ok')
+                        for alarm in self.alarms]
+            update_calls = self.api_client.alarms.update.call_args_list
+            self.assertEqual(update_calls, expected)
+            reasons = ['Transition to ok due to 5 samples inside'
+                       ' threshold, most recent: 76.0',
+                       'Transition to ok due to 4 samples inside'
+                       ' threshold, most recent: 14.0']
+            expected = [mock.call(alarm, 'ok', reason)
+                        for alarm, reason in zip(self.alarms, reasons)]
+            self.assertEqual(self.notifier.notify.call_args_list, expected)
+
+    def test_equivocal_from_known_state(self):
+        self._set_all_alarms('ok')
+        with mock.patch('ceilometerclient.client.get_client',
+                        return_value=self.api_client):
+            avgs = [self._get_stat('avg', self.alarms[0].threshold + v)
+                    for v in xrange(5)]
+            maxs = [self._get_stat('max', self.alarms[1].threshold - v)
+                    for v in xrange(-1, 3)]
+            self.api_client.statistics.list.side_effect = [avgs, maxs]
+            self.evaluator.evaluate()
+            self._assert_all_alarms('ok')
+            self.assertEqual(self.api_client.alarms.update.call_args_list,
+                             [])
+            self.assertEqual(self.notifier.notify.call_args_list, [])
+
+    def test_equivocal_from_unknown(self):
+        self._set_all_alarms('insufficient data')
+        with mock.patch('ceilometerclient.client.get_client',
+                        return_value=self.api_client):
+            avgs = [self._get_stat('avg', self.alarms[0].threshold + v)
+                    for v in xrange(1, 6)]
+            maxs = [self._get_stat('max', self.alarms[1].threshold - v)
+                    for v in xrange(4)]
+            self.api_client.statistics.list.side_effect = [avgs, maxs]
+            self.evaluator.evaluate()
+            self._assert_all_alarms('alarm')
+            expected = [mock.call(alarm.alarm_id, state='alarm')
+                        for alarm in self.alarms]
+            update_calls = self.api_client.alarms.update.call_args_list
+            self.assertEqual(update_calls, expected)
+            reasons = ['Transition to alarm due to 5 samples outside'
+                       ' threshold, most recent: 85.0',
+                       'Transition to alarm due to 4 samples outside'
+                       ' threshold, most recent: 7.0']
+            expected = [mock.call(alarm, 'alarm', reason)
+                        for alarm, reason in zip(self.alarms, reasons)]
+            self.assertEqual(self.notifier.notify.call_args_list, expected)
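
The periodic service wiring that will drive the Evaluator is not included in
this change, so the following is only a minimal sketch of how it might be
driven. It assumes a notifier exposing a notify(alarm, state, reason) method
(the sole interface the Evaluator invokes) and re-fetches the alarm set each
cycle via python-ceilometerclient; the LogOnlyNotifier class, the run()
helper, the creds dict and the 60-second interval are illustrative
placeholders rather than anything defined by this patch.

    import time

    from ceilometerclient import client as ceiloclient

    from ceilometer.alarm import threshold_evaluation


    class LogOnlyNotifier(object):
        """Hypothetical notifier: only notify(alarm, state, reason) is
        required by the Evaluator."""
        def notify(self, alarm, state, reason):
            print('alarm %s -> %s (%s)' % (alarm.alarm_id, state, reason))


    def run(creds, interval=60):
        # hypothetical driver loop: refresh the assigned alarms every
        # cycle so newly created or deleted alarms are picked up, then
        # evaluate them against their thresholds
        evaluator = threshold_evaluation.Evaluator(LogOnlyNotifier())
        api = ceiloclient.get_client(2, **creds)
        while True:
            evaluator.assign_alarms(api.alarms.list())
            evaluator.evaluate()
            time.sleep(interval)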