Allow extending the evaluator lookback window
Sometimes the alarm state flaps simply because the most recent datapoint has not been ingested yet. This can be solved by increasing the resources dedicated to the metric injection chain or, for less critical scenarios, by allowing a bigger lookback window. This change allows extending the lookback window with the new configuration option 'additional_ingestion_lag'.

Change-Id: If2aca73aea95c0c6d08afa5fbb89b949099507db
Closes-bug: #1540298
Closes-bug: #1506911
parent 06204adac4
commit b3874c47f1
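For readers skimming the diff, here is a minimal standalone sketch of the window arithmetic this patch introduces in _bound_duration(); the helper name query_window is hypothetical and only restates the computation shown in the hunks below:

import datetime

from oslo_utils import timeutils


def query_window(period, evaluation_periods, additional_ingestion_lag,
                 look_back=1):
    # Sliding window: the evaluation periods plus the extra look-back
    # period, stretched by the configured ingestion lag (in seconds).
    window = (period * (evaluation_periods + look_back)
              + additional_ingestion_lag)
    now = timeutils.utcnow()
    return now - datetime.timedelta(seconds=window), now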
@@ -19,6 +19,7 @@ import operator
 
 import six
 
 from ceilometerclient import client as ceiloclient
+from oslo_config import cfg
 from oslo_log import log
 from oslo_utils import timeutils
@@ -38,11 +39,20 @@ COMPARATORS = {
     'ne': operator.ne,
 }
 
+OPTS = [
+    cfg.IntOpt('additional_ingestion_lag',
+               min=0,
+               default=0,
+               help='The number of seconds to extend the evaluation windows '
+                    'to compensate the reporting/ingestion lag.')
+]
+
 
 class ThresholdEvaluator(evaluator.Evaluator):
 
     # the sliding evaluation window is extended to allow
-    # for reporting/ingestion lag
+    # for the reporting/ingestion lag; this can be increased
+    # with 'additional_ingestion_lag' seconds if needed.
     look_back = 1
 
     def __init__(self, conf):
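The new option is a plain oslo.config integer option. A minimal sketch of registering and reading it outside aodh (the standalone ConfigOpts instance is purely illustrative; aodh wires this up through its own option registration, see the list_opts() hunk further down):

from oslo_config import cfg

conf = cfg.ConfigOpts()
conf.register_opts([
    cfg.IntOpt('additional_ingestion_lag',
               min=0,
               default=0,
               help='The number of seconds to extend the evaluation windows '
                    'to compensate the reporting/ingestion lag.'),
])
conf([])  # no CLI args or config files, so defaults apply
print(conf.additional_ingestion_lag)  # 0 unless overridden in the config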
@@ -63,17 +73,17 @@ class ThresholdEvaluator(evaluator.Evaluator):
 
         return self._cm_client
 
-    @classmethod
-    def _bound_duration(cls, rule):
+    def _bound_duration(self, rule):
         """Bound the duration of the statistics query."""
         now = timeutils.utcnow()
         # when exclusion of weak datapoints is enabled, we extend
         # the look-back period so as to allow a clearer sample count
         # trend to be established
-        look_back = (cls.look_back if not rule.get('exclude_outliers')
+        look_back = (self.look_back if not rule.get('exclude_outliers')
                      else rule['evaluation_periods'])
         window = ((rule.get('period', None) or rule['granularity'])
-                  * (rule['evaluation_periods'] + look_back))
+                  * (rule['evaluation_periods'] + look_back) +
+                  self.conf.additional_ingestion_lag)
         start = now - datetime.timedelta(seconds=window)
         LOG.debug('query stats from %(start)s to '
                   '%(now)s', {'start': start, 'now': now})
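A quick check of the new arithmetic (illustrative values; evaluation_periods=5 and look_back=1 are assumptions chosen to line up with the first alarm asserted in the test further down):

period = 60
evaluation_periods = 5       # assumed; consistent with the 10:45:00 -> 10:39:00 bound below
look_back = 1                # class default when exclude_outliers is not set
additional_ingestion_lag = 42

window = period * (evaluation_periods + look_back) + additional_ingestion_lag
print(window)  # 402 seconds, so the lower bound moves from 10:39:00 to 10:38:18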
@@ -35,6 +35,7 @@ def list_opts():
         itertools.chain(
             aodh.evaluator.OPTS,
             aodh.evaluator.event.OPTS,
+            aodh.evaluator.threshold.OPTS,
             aodh.notifier.rest.OPTS,
             aodh.queue.OPTS,
             aodh.service.OPTS)),
@@ -193,6 +193,56 @@ class TestEvaluate(base.TestEvaluatorBase):
                     in zip(self.alarms, reasons, reason_datas)]
         self.assertEqual(expected, self.notifier.notify.call_args_list)
 
+    @mock.patch.object(timeutils, 'utcnow')
+    def test_lag_configuration(self, mock_utcnow):
+        mock_utcnow.return_value = datetime.datetime(2012, 7, 2, 10, 45)
+        self.api_client.statistics.list.side_effect = []
+
+        self._set_all_alarms('ok')
+        self._evaluate_all_alarms()
+        self._set_all_alarms('ok')
+        self.conf.set_override("additional_ingestion_lag", 42)
+        self._evaluate_all_alarms()
+
+        self.assertEqual([
+            mock.call(
+                meter_name='cpu_util', period=60,
+                q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
+                   {'value': 'my_instance', 'op': 'eq',
+                    'field': 'resource_id'},
+                   {'value': '2012-07-02T10:45:00', 'op': 'le',
+                    'field': 'timestamp'},
+                   {'value': '2012-07-02T10:39:00', 'op': 'ge',
+                    'field': 'timestamp'}]),
+            mock.call(
+                meter_name='cpu_util', period=300,
+                q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
+                   {'value': 'my_group', 'op': 'eq',
+                    'field': 'metadata.user_metadata.AS'},
+                   {'value': '2012-07-02T10:45:00', 'op': 'le',
+                    'field': 'timestamp'},
+                   {'value': '2012-07-02T10:20:00', 'op': 'ge',
+                    'field': 'timestamp'}]),
+            mock.call(
+                meter_name='cpu_util', period=60,
+                q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
+                   {'value': 'my_instance', 'op': 'eq',
+                    'field': 'resource_id'},
+                   {'value': '2012-07-02T10:45:00', 'op': 'le',
+                    'field': 'timestamp'},
+                   {'value': '2012-07-02T10:38:18', 'op': 'ge',
+                    'field': 'timestamp'}]),
+            mock.call(
+                meter_name='cpu_util', period=300,
+                q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
+                   {'value': 'my_group', 'op': 'eq',
+                    'field': 'metadata.user_metadata.AS'},
+                   {'value': '2012-07-02T10:45:00', 'op': 'le',
+                    'field': 'timestamp'},
+                   {'value': '2012-07-02T10:19:18', 'op': 'ge',
+                    'field': 'timestamp'}])],
+            self.api_client.statistics.list.mock_calls)
+
     def test_simple_alarm_clear(self):
         self._set_all_alarms('alarm')
         avgs = [self._get_stat('avg', self.alarms[0].rule['threshold'] - v)
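The before/after expectations in the test differ only by the configured lag: the lower query bounds move from 10:39:00 to 10:38:18 and from 10:20:00 to 10:19:18 once the 42-second override is set. A trivial check of that shift (not part of the patch):

import datetime

without_lag = datetime.datetime(2012, 7, 2, 10, 39, 0)
with_lag = datetime.datetime(2012, 7, 2, 10, 38, 18)
assert (without_lag - with_lag).total_seconds() == 42  # additional_ingestion_lag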
releasenotes/notes/ingestion-lag-2317725887287fbc.yaml (new file, 7 lines)
@@ -0,0 +1,7 @@
+---
+features:
+  - Allow extending the alarm evaluation windows to compensate for the
+    reporting/ingestion lag.
+
+    A new option, additional_ingestion_lag, is introduced and defaults to 0.
+    It represents the number of seconds by which the window is extended.