Allow extending the evaluator lookback window
Sometimes the alarm state flaps simply because the most recent datapoint has not been ingested yet. This can be solved by increasing the resources dedicated to the metric injection chain or, for less critical scenarios, by allowing a bigger lookback window. This change allows extending the lookback window with the new configuration option 'additional_ingestion_lag'.

Change-Id: If2aca73aea95c0c6d08afa5fbb89b949099507db
Closes-bug: #1540298
Closes-bug: #1506911
parent 06204adac4
commit b3874c47f1
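For readers skimming the diff, here is a minimal standalone sketch of the window arithmetic this patch introduces in _bound_duration(); the helper name query_window is hypothetical and only restates the computation shown in the hunks below:

import datetime

from oslo_utils import timeutils


def query_window(period, evaluation_periods, additional_ingestion_lag,
                 look_back=1):
    # Sliding window: the evaluation periods plus the extra look-back
    # period, stretched by the configured ingestion lag (in seconds).
    window = (period * (evaluation_periods + look_back)
              + additional_ingestion_lag)
    now = timeutils.utcnow()
    return now - datetime.timedelta(seconds=window), now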
@@ -19,6 +19,7 @@ import operator
 
 import six
 
 from ceilometerclient import client as ceiloclient
+from oslo_config import cfg
 from oslo_log import log
 from oslo_utils import timeutils
@@ -38,11 +39,20 @@ COMPARATORS = {
     'ne': operator.ne,
 }
 
+OPTS = [
+    cfg.IntOpt('additional_ingestion_lag',
+               min=0,
+               default=0,
+               help='The number of seconds to extend the evaluation windows '
+                    'to compensate the reporting/ingestion lag.')
+]
+
 
 class ThresholdEvaluator(evaluator.Evaluator):
 
     # the sliding evaluation window is extended to allow
-    # for reporting/ingestion lag
+    # for the reporting/ingestion lag; this can be increased
+    # with 'additional_ingestion_lag' seconds if needed.
     look_back = 1
 
     def __init__(self, conf):
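The new option is a plain oslo.config integer option. A minimal sketch of registering and reading it outside aodh (the standalone ConfigOpts instance is purely illustrative; aodh wires this up through its own option registration, see the list_opts() hunk further down):

from oslo_config import cfg

conf = cfg.ConfigOpts()
conf.register_opts([
    cfg.IntOpt('additional_ingestion_lag',
               min=0,
               default=0,
               help='The number of seconds to extend the evaluation windows '
                    'to compensate the reporting/ingestion lag.'),
])
conf([])  # no CLI args or config files, so defaults apply
print(conf.additional_ingestion_lag)  # 0 unless overridden in the config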
@@ -63,17 +73,17 @@ class ThresholdEvaluator(evaluator.Evaluator):
 
         return self._cm_client
 
-    @classmethod
-    def _bound_duration(cls, rule):
+    def _bound_duration(self, rule):
         """Bound the duration of the statistics query."""
         now = timeutils.utcnow()
         # when exclusion of weak datapoints is enabled, we extend
         # the look-back period so as to allow a clearer sample count
         # trend to be established
-        look_back = (cls.look_back if not rule.get('exclude_outliers')
+        look_back = (self.look_back if not rule.get('exclude_outliers')
                      else rule['evaluation_periods'])
         window = ((rule.get('period', None) or rule['granularity'])
-                  * (rule['evaluation_periods'] + look_back))
+                  * (rule['evaluation_periods'] + look_back) +
+                  self.conf.additional_ingestion_lag)
         start = now - datetime.timedelta(seconds=window)
         LOG.debug('query stats from %(start)s to '
                   '%(now)s', {'start': start, 'now': now})
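A quick check of the new arithmetic (illustrative values; evaluation_periods=5 and look_back=1 are assumptions chosen to line up with the first alarm asserted in the test further down):

period = 60
evaluation_periods = 5       # assumed; consistent with the 10:45:00 -> 10:39:00 bound below
look_back = 1                # class default when exclude_outliers is not set
additional_ingestion_lag = 42

window = period * (evaluation_periods + look_back) + additional_ingestion_lag
print(window)  # 402 seconds, so the lower bound moves from 10:39:00 to 10:38:18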
@@ -35,6 +35,7 @@ def list_opts():
         itertools.chain(
             aodh.evaluator.OPTS,
             aodh.evaluator.event.OPTS,
+            aodh.evaluator.threshold.OPTS,
             aodh.notifier.rest.OPTS,
             aodh.queue.OPTS,
             aodh.service.OPTS)),
@@ -193,6 +193,56 @@ class TestEvaluate(base.TestEvaluatorBase):
                     in zip(self.alarms, reasons, reason_datas)]
         self.assertEqual(expected, self.notifier.notify.call_args_list)
 
+    @mock.patch.object(timeutils, 'utcnow')
+    def test_lag_configuration(self, mock_utcnow):
+        mock_utcnow.return_value = datetime.datetime(2012, 7, 2, 10, 45)
+        self.api_client.statistics.list.side_effect = []
+
+        self._set_all_alarms('ok')
+        self._evaluate_all_alarms()
+        self._set_all_alarms('ok')
+        self.conf.set_override("additional_ingestion_lag", 42)
+        self._evaluate_all_alarms()
+
+        self.assertEqual([
+            mock.call(
+                meter_name='cpu_util', period=60,
+                q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
+                   {'value': 'my_instance', 'op': 'eq',
+                    'field': 'resource_id'},
+                   {'value': '2012-07-02T10:45:00', 'op': 'le',
+                    'field': 'timestamp'},
+                   {'value': '2012-07-02T10:39:00', 'op': 'ge',
+                    'field': 'timestamp'}]),
+            mock.call(
+                meter_name='cpu_util', period=300,
+                q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
+                   {'value': 'my_group', 'op': 'eq',
+                    'field': 'metadata.user_metadata.AS'},
+                   {'value': '2012-07-02T10:45:00', 'op': 'le',
+                    'field': 'timestamp'},
+                   {'value': '2012-07-02T10:20:00', 'op': 'ge',
+                    'field': 'timestamp'}]),
+            mock.call(
+                meter_name='cpu_util', period=60,
+                q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
+                   {'value': 'my_instance', 'op': 'eq',
+                    'field': 'resource_id'},
+                   {'value': '2012-07-02T10:45:00', 'op': 'le',
+                    'field': 'timestamp'},
+                   {'value': '2012-07-02T10:38:18', 'op': 'ge',
+                    'field': 'timestamp'}]),
+            mock.call(
+                meter_name='cpu_util', period=300,
+                q=[{'value': 'cpu_util', 'op': 'eq', 'field': 'meter'},
+                   {'value': 'my_group', 'op': 'eq',
+                    'field': 'metadata.user_metadata.AS'},
+                   {'value': '2012-07-02T10:45:00', 'op': 'le',
+                    'field': 'timestamp'},
+                   {'value': '2012-07-02T10:19:18', 'op': 'ge',
+                    'field': 'timestamp'}])],
+            self.api_client.statistics.list.mock_calls)
+
     def test_simple_alarm_clear(self):
         self._set_all_alarms('alarm')
         avgs = [self._get_stat('avg', self.alarms[0].rule['threshold'] - v)
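The before/after expectations in the test differ only by the configured lag: the lower query bounds move from 10:39:00 to 10:38:18 and from 10:20:00 to 10:19:18 once the 42-second override is set. A trivial check of that shift (not part of the patch):

import datetime

without_lag = datetime.datetime(2012, 7, 2, 10, 39, 0)
with_lag = datetime.datetime(2012, 7, 2, 10, 38, 18)
assert (without_lag - with_lag).total_seconds() == 42  # additional_ingestion_lag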
releasenotes/notes/ingestion-lag-2317725887287fbc.yaml (new file, 7 lines)
@@ -0,0 +1,7 @@
+---
+features:
+  - Allow extending the alarm evaluation windows to compensate for the
+    reporting/ingestion lag.
+
+    A new option, additional_ingestion_lag, is introduced and defaults to 0.
+    It represents the number of seconds by which the window is extended.