Merge "Exclude weak datapoints from alarm threshold evaluation"
This commit is contained in:
commit
447573f8c9
@ -21,6 +21,7 @@ import datetime
|
||||
import operator
|
||||
|
||||
from ceilometer.alarm import evaluator
|
||||
from ceilometer.alarm.evaluator import utils
|
||||
from ceilometer.openstack.common.gettextutils import _ # noqa
|
||||
from ceilometer.openstack.common import log
|
||||
from ceilometer.openstack.common import timeutils
|
||||
@ -51,8 +52,13 @@ class ThresholdEvaluator(evaluator.Evaluator):
|
||||
def _bound_duration(cls, alarm, constraints):
|
||||
"""Bound the duration of the statistics query."""
|
||||
now = timeutils.utcnow()
|
||||
# when exclusion of weak datapoints is enabled, we extend
|
||||
# the look-back period so as to allow a clearer sample count
|
||||
# trend to be established
|
||||
look_back = (cls.look_back if not alarm.rule.get('exclude_outliers')
|
||||
else alarm.rule['evaluation_periods'])
|
||||
window = (alarm.rule['period'] *
|
||||
(alarm.rule['evaluation_periods'] + cls.look_back))
|
||||
(alarm.rule['evaluation_periods'] + look_back))
|
||||
start = now - datetime.timedelta(seconds=window)
|
||||
LOG.debug(_('query stats from %(start)s to '
|
||||
'%(now)s') % {'start': start, 'now': now})
|
||||
@ -64,13 +70,25 @@ class ThresholdEvaluator(evaluator.Evaluator):
|
||||
@staticmethod
def _sanitize(alarm, statistics):
    """Sanitize statistics prior to threshold comparison.

    When the alarm rule has 'exclude_outliers' set, datapoints whose
    sample count lies more than two standard deviations away from the
    mean sample count are dropped. The result is then pruned to the
    trailing 'evaluation_periods' datapoints.

    :param alarm: the alarm being evaluated; only alarm.rule is read
    :param statistics: list of statistics datapoints exposing 'count'
    :returns: the (possibly shorter) list of datapoints to evaluate
    """
    LOG.debug(_('sanitize stats %s') % statistics)
    if alarm.rule.get('exclude_outliers'):
        key = operator.attrgetter('count')
        mean = utils.mean(statistics, key)
        stddev = utils.stddev(statistics, key, mean)
        lower = mean - 2 * stddev
        upper = mean + 2 * stddev
        inliers, outliers = utils.anomolies(statistics, key, lower, upper)
        if outliers:
            LOG.debug(_('excluded weak datapoints with sample counts %s'),
                      [s.count for s in outliers])
            statistics = inliers
        else:
            LOG.debug('no excluded weak datapoints')

    # in practice statistics are always sorted by period start, not
    # strictly required by the API though
    #
    # keep only the tail of the list: slicing from the head first (as
    # the superseded code did) would discard the newest datapoints and
    # leave the oldest ones to be evaluated
    statistics = statistics[-alarm.rule['evaluation_periods']:]
    LOG.debug(_('pruned statistics to %d') % len(statistics))
    return statistics
|
||||
|
||||
|
67
ceilometer/alarm/evaluator/utils.py
Normal file
67
ceilometer/alarm/evaluator/utils.py
Normal file
@ -0,0 +1,67 @@
|
||||
# -*- encoding: utf-8 -*-
|
||||
#
|
||||
# Copyright © 2014 Red Hat, Inc
|
||||
#
|
||||
# Author: Eoghan Glynn <eglynn@redhat.com>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import math
|
||||
|
||||
|
||||
def mean(s, key=lambda x: x):
    """Calculate the mean of a numeric list.

    :param s: sized sequence of datapoints
    :param key: callable extracting the numeric value from a datapoint,
                the identity by default
    :returns: the mean as a float, or 0.0 for an empty sequence
    """
    size = float(len(s))
    if not size:
        return 0.0
    values = [key(item) for item in s]
    return math.fsum(values) / size
|
||||
|
||||
|
||||
def deltas(s, key, m=None):
    """Calculate the squared distances from mean for a numeric list.

    :param s: sequence of datapoints
    :param key: callable extracting the numeric value from a datapoint
    :param m: reference mean; computed from s via mean() when None
    :returns: list of (key(i) - m) ** 2 for each datapoint i in s
    """
    # test against None explicitly: a caller-supplied mean of 0 is a
    # legitimate value and must not be mistaken for "not supplied"
    if m is None:
        m = mean(s, key)
    return [(key(i) - m) ** 2 for i in s]
|
||||
|
||||
|
||||
def variance(s, key, m=None):
    """Calculate the variance of a numeric list.

    :param s: sequence of datapoints
    :param key: callable extracting the numeric value from a datapoint
    :param m: optional mean, passed through to deltas()
    :returns: the mean of the squared distances returned by deltas()
    """
    squared_distances = deltas(s, key, m)
    return mean(squared_distances)
|
||||
|
||||
|
||||
def stddev(s, key, m=None):
    """Calculate the standard deviation of a numeric list.

    :param s: sequence of datapoints
    :param key: callable extracting the numeric value from a datapoint
    :param m: optional mean, passed through to variance()
    :returns: the square root of variance(s, key, m)
    """
    spread = variance(s, key, m)
    return math.sqrt(spread)
|
||||
|
||||
|
||||
def outside(s, key, lower=0.0, upper=0.0):
    """Determine if value falls outside upper and lower bounds.

    :param s: a single datapoint
    :param key: callable extracting the numeric value from the datapoint
    :param lower: inclusive lower bound
    :param upper: inclusive upper bound
    :returns: True if key(s) is below lower or above upper
    """
    value = key(s)
    if value < lower:
        return True
    return value > upper
|
||||
|
||||
|
||||
def anomolies(s, key, lower=0.0, upper=0.0):
    """Separate anomalous datapoints from the inliers.

    :param s: iterable of datapoints
    :param key: callable extracting the numeric value from a datapoint
    :param lower: inclusive lower bound for an inlier
    :param upper: inclusive upper bound for an inlier
    :returns: an (inliers, outliers) tuple of lists, each preserving
              the order in which datapoints appear in s
    """
    inliers, outliers = [], []
    for datapoint in s:
        is_outlier = outside(datapoint, key, lower, upper)
        bucket = outliers if is_outlier else inliers
        bucket.append(datapoint)
    return inliers, outliers
|
@ -1111,6 +1111,9 @@ class AlarmThresholdRule(_Base):
|
||||
evaluation_periods = wsme.wsattr(BoundedInt(min=1), default=1)
|
||||
"The number of historical periods to evaluate the threshold"
|
||||
|
||||
exclude_outliers = wsme.wsattr(bool, default=False)
|
||||
"Whether datapoints with anomolously low sample counts are excluded"
|
||||
|
||||
def __init__(self, query=None, **kwargs):
|
||||
if query:
|
||||
query = [Query(**q) for q in query]
|
||||
@ -1143,7 +1146,8 @@ class AlarmThresholdRule(_Base):
|
||||
def as_dict(self):
|
||||
rule = self.as_dict_from_keys(['period', 'comparison_operator',
|
||||
'threshold', 'statistic',
|
||||
'evaluation_periods', 'meter_name'])
|
||||
'evaluation_periods', 'meter_name',
|
||||
'exclude_outliers'])
|
||||
rule['query'] = [q.as_dict() for q in self.query]
|
||||
return rule
|
||||
|
||||
|
@ -96,8 +96,12 @@ class TestEvaluate(base.TestEvaluatorBase):
|
||||
]
|
||||
|
||||
@staticmethod
def _get_stat(attr, value, count=1):
    """Build a Statistics datapoint carrying one statistic and a count.

    :param attr: name of the statistic attribute to set
    :param value: value stored under that attribute
    :param count: value stored under 'count' (defaults to 1)
    """
    # a single definition: the superseded two-argument variant is gone so
    # that the staticmethod decorator applies to the count-aware helper
    return statistics.Statistics(None, {attr: value, 'count': count})
|
||||
|
||||
def _set_all_rules(self, field, value):
|
||||
for alarm in self.alarms:
|
||||
alarm.rule[field] = value
|
||||
|
||||
def test_retry_transient_api_failure(self):
|
||||
with mock.patch('ceilometerclient.client.get_client',
|
||||
@ -150,9 +154,9 @@ class TestEvaluate(base.TestEvaluatorBase):
|
||||
update_calls = self.api_client.alarms.set_state.call_args_list
|
||||
self.assertEqual(update_calls, expected)
|
||||
reasons = ['Transition to alarm due to 5 samples outside'
|
||||
' threshold, most recent: 85.0',
|
||||
' threshold, most recent: %s' % avgs[-1].avg,
|
||||
'Transition to alarm due to 4 samples outside'
|
||||
' threshold, most recent: 7.0']
|
||||
' threshold, most recent: %s' % maxs[-1].max]
|
||||
expected = [mock.call(alarm, 'ok', reason)
|
||||
for alarm, reason in zip(self.alarms, reasons)]
|
||||
self.assertEqual(self.notifier.notify.call_args_list, expected)
|
||||
@ -173,9 +177,9 @@ class TestEvaluate(base.TestEvaluatorBase):
|
||||
update_calls = self.api_client.alarms.set_state.call_args_list
|
||||
self.assertEqual(update_calls, expected)
|
||||
reasons = ['Transition to ok due to 5 samples inside'
|
||||
' threshold, most recent: 76.0',
|
||||
' threshold, most recent: %s' % avgs[-1].avg,
|
||||
'Transition to ok due to 4 samples inside'
|
||||
' threshold, most recent: 14.0']
|
||||
' threshold, most recent: %s' % maxs[-1].max]
|
||||
expected = [mock.call(alarm, 'alarm', reason)
|
||||
for alarm, reason in zip(self.alarms, reasons)]
|
||||
self.assertEqual(self.notifier.notify.call_args_list, expected)
|
||||
@ -251,9 +255,9 @@ class TestEvaluate(base.TestEvaluatorBase):
|
||||
update_calls = self.api_client.alarms.set_state.call_args_list
|
||||
self.assertEqual(update_calls, expected)
|
||||
reasons = ['Transition to alarm due to 5 samples outside'
|
||||
' threshold, most recent: 85.0',
|
||||
' threshold, most recent: %s' % avgs[-1].avg,
|
||||
'Transition to alarm due to 4 samples outside'
|
||||
' threshold, most recent: 7.0']
|
||||
' threshold, most recent: %s' % maxs[-1].max]
|
||||
expected = [mock.call(alarm, 'ok', reason)
|
||||
for alarm, reason in zip(self.alarms, reasons)]
|
||||
self.assertEqual(self.notifier.notify.call_args_list, expected)
|
||||
@ -274,25 +278,37 @@ class TestEvaluate(base.TestEvaluatorBase):
|
||||
update_calls = self.api_client.alarms.set_state.call_args_list
|
||||
self.assertEqual(update_calls, expected)
|
||||
reasons = ['Transition to alarm due to 5 samples outside'
|
||||
' threshold, most recent: 85.0',
|
||||
' threshold, most recent: %s' % avgs[-1].avg,
|
||||
'Transition to alarm due to 4 samples outside'
|
||||
' threshold, most recent: 7.0']
|
||||
' threshold, most recent: %s' % maxs[-1].max]
|
||||
expected = [mock.call(alarm, 'insufficient data', reason)
|
||||
for alarm, reason in zip(self.alarms, reasons)]
|
||||
self.assertEqual(self.notifier.notify.call_args_list, expected)
|
||||
|
||||
def test_bound_duration(self):
|
||||
def _do_test_bound_duration(self, start, exclude_outliers=None):
    """Check the timestamp constraints produced by _bound_duration.

    :param start: expected value of the 'ge' timestamp constraint
    :param exclude_outliers: when not None, stored under
                             'exclude_outliers' in the first alarm's rule
                             before the constraints are computed
    """
    alarm = self.alarms[0]
    if exclude_outliers is not None:
        alarm.rule['exclude_outliers'] = exclude_outliers
    # pin "now" so that the computed window start is predictable
    timeutils.utcnow.override_time = datetime.datetime(2012, 7, 2, 10, 45)
    constraint = self.evaluator._bound_duration(alarm, [])
    self.assertEqual(constraint, [
        {'field': 'timestamp',
         'op': 'le',
         'value': timeutils.utcnow().isoformat()},
        {'field': 'timestamp',
         'op': 'ge',
         'value': start},
    ])
|
||||
|
||||
def test_bound_duration_outlier_exclusion_defaulted(self):
    # 'exclude_outliers' is not written to the alarm rule at all
    self._do_test_bound_duration(start='2012-07-02T10:39:00')
|
||||
|
||||
def test_bound_duration_outlier_exclusion_clear(self):
    # 'exclude_outliers' explicitly disabled on the alarm rule
    self._do_test_bound_duration(start='2012-07-02T10:39:00',
                                 exclude_outliers=False)
|
||||
|
||||
def test_bound_duration_outlier_exclusion_set(self):
    # 'exclude_outliers' enabled: an earlier window start is expected
    self._do_test_bound_duration(start='2012-07-02T10:35:00',
                                 exclude_outliers=True)
|
||||
|
||||
def test_threshold_endpoint_types(self):
|
||||
endpoint_types = ["internalURL", "publicURL"]
|
||||
for endpoint_type in endpoint_types:
|
||||
@ -313,3 +329,81 @@ class TestEvaluate(base.TestEvaluatorBase):
|
||||
os_endpoint_type=conf.os_endpoint_type)]
|
||||
actual = client.call_args_list
|
||||
self.assertEqual(actual, expected)
|
||||
|
||||
def _do_test_simple_alarm_trip_outlier_exclusion(self, exclude_outliers):
|
||||
self._set_all_rules('exclude_outliers', exclude_outliers)
|
||||
self._set_all_alarms('ok')
|
||||
with mock.patch('ceilometerclient.client.get_client',
|
||||
return_value=self.api_client):
|
||||
# most recent datapoints inside threshold but with
|
||||
# anomalously low sample count
|
||||
threshold = self.alarms[0].rule['threshold']
|
||||
avgs = [self._get_stat('avg',
|
||||
threshold + (v if v < 10 else -v),
|
||||
count=20 if v < 10 else 1)
|
||||
for v in xrange(1, 11)]
|
||||
threshold = self.alarms[1].rule['threshold']
|
||||
maxs = [self._get_stat('max',
|
||||
threshold - (v if v < 7 else -v),
|
||||
count=20 if v < 7 else 1)
|
||||
for v in xrange(8)]
|
||||
self.api_client.statistics.list.side_effect = [avgs, maxs]
|
||||
self._evaluate_all_alarms()
|
||||
self._assert_all_alarms('alarm' if exclude_outliers else 'ok')
|
||||
if exclude_outliers:
|
||||
expected = [mock.call(alarm.alarm_id, state='alarm')
|
||||
for alarm in self.alarms]
|
||||
update_calls = self.api_client.alarms.set_state.call_args_list
|
||||
self.assertEqual(update_calls, expected)
|
||||
reasons = ['Transition to alarm due to 5 samples outside'
|
||||
' threshold, most recent: %s' % avgs[-2].avg,
|
||||
'Transition to alarm due to 4 samples outside'
|
||||
' threshold, most recent: %s' % maxs[-2].max]
|
||||
expected = [mock.call(alarm, 'ok', reason)
|
||||
for alarm, reason in zip(self.alarms, reasons)]
|
||||
self.assertEqual(self.notifier.notify.call_args_list, expected)
|
||||
|
||||
def test_simple_alarm_trip_with_outlier_exclusion(self):
    # run the shared trip scenario with outlier exclusion enabled
    self._do_test_simple_alarm_trip_outlier_exclusion(
        exclude_outliers=True)
|
||||
|
||||
def test_simple_alarm_no_trip_without_outlier_exclusion(self):
    # run the shared trip scenario with outlier exclusion disabled
    self._do_test_simple_alarm_trip_outlier_exclusion(
        exclude_outliers=False)
|
||||
|
||||
def _do_test_simple_alarm_clear_outlier_exclusion(self, exclude_outliers):
|
||||
self._set_all_rules('exclude_outliers', exclude_outliers)
|
||||
self._set_all_alarms('alarm')
|
||||
with mock.patch('ceilometerclient.client.get_client',
|
||||
return_value=self.api_client):
|
||||
# most recent datapoints outside threshold but with
|
||||
# anomalously low sample count
|
||||
threshold = self.alarms[0].rule['threshold']
|
||||
avgs = [self._get_stat('avg',
|
||||
threshold - (v if v < 9 else -v),
|
||||
count=20 if v < 9 else 1)
|
||||
for v in xrange(10)]
|
||||
threshold = self.alarms[1].rule['threshold']
|
||||
maxs = [self._get_stat('max',
|
||||
threshold + (v if v < 8 else -v),
|
||||
count=20 if v < 8 else 1)
|
||||
for v in xrange(1, 9)]
|
||||
self.api_client.statistics.list.side_effect = [avgs, maxs]
|
||||
self._evaluate_all_alarms()
|
||||
self._assert_all_alarms('ok' if exclude_outliers else 'alarm')
|
||||
if exclude_outliers:
|
||||
expected = [mock.call(alarm.alarm_id, state='ok')
|
||||
for alarm in self.alarms]
|
||||
update_calls = self.api_client.alarms.set_state.call_args_list
|
||||
self.assertEqual(update_calls, expected)
|
||||
reasons = ['Transition to ok due to 5 samples inside'
|
||||
' threshold, most recent: %s' % avgs[-2].avg,
|
||||
'Transition to ok due to 4 samples inside'
|
||||
' threshold, most recent: %s' % maxs[-2].max]
|
||||
expected = [mock.call(alarm, 'alarm', reason)
|
||||
for alarm, reason in zip(self.alarms, reasons)]
|
||||
self.assertEqual(self.notifier.notify.call_args_list, expected)
|
||||
|
||||
def test_simple_alarm_clear_with_outlier_exclusion(self):
    # run the shared clear scenario with outlier exclusion enabled
    self._do_test_simple_alarm_clear_outlier_exclusion(
        exclude_outliers=True)
|
||||
|
||||
def test_simple_alarm_no_clear_without_outlier_exclusion(self):
    # run the shared clear scenario with outlier exclusion disabled
    self._do_test_simple_alarm_clear_outlier_exclusion(
        exclude_outliers=False)
|
||||
|
@ -150,6 +150,23 @@ class TestAlarms(FunctionalTest,
|
||||
)]:
|
||||
self.conn.update_alarm(alarm)
|
||||
|
||||
@staticmethod
|
||||
def _add_default_threshold_rule(alarm):
|
||||
if 'exclude_outliers' not in alarm['threshold_rule']:
|
||||
alarm['threshold_rule']['exclude_outliers'] = False
|
||||
|
||||
def _verify_alarm(self, json, alarm, expected_name=None):
    """Assert that an alarm's attributes match the given JSON body.

    :param json: dict describing the alarm; it is first passed through
                 self._add_default_threshold_rule, which may modify it
    :param alarm: the alarm object whose attributes are compared
    :param expected_name: when truthy, the test fails unless alarm.name
                          equals it
    """
    if expected_name:
        if alarm.name != expected_name:
            self.fail("Alarm not found")
    self._add_default_threshold_rule(json)
    for key in json:
        # any '*_rule' entry is compared against the 'rule' attribute
        storage_key = 'rule' if key.endswith('_rule') else key
        actual = getattr(alarm, storage_key)
        self.assertEqual(actual, json[key])
|
||||
|
||||
def test_list_alarms(self):
|
||||
data = self.get_json('/alarms')
|
||||
self.assertEqual(4, len(data))
|
||||
@ -394,6 +411,7 @@ class TestAlarms(FunctionalTest,
|
||||
}
|
||||
|
||||
}
|
||||
self._add_default_threshold_rule(to_check)
|
||||
|
||||
json = {
|
||||
'name': 'added_alarm_defaults',
|
||||
@ -420,7 +438,7 @@ class TestAlarms(FunctionalTest,
|
||||
else:
|
||||
self.fail("Alarm not found")
|
||||
|
||||
def test_post_alarm(self):
|
||||
def _do_test_post_alarm(self, exclude_outliers=None):
|
||||
json = {
|
||||
'enabled': False,
|
||||
'name': 'added_alarm',
|
||||
@ -443,6 +461,9 @@ class TestAlarms(FunctionalTest,
|
||||
'period': '180',
|
||||
}
|
||||
}
|
||||
if exclude_outliers is not None:
|
||||
json['threshold_rule']['exclude_outliers'] = exclude_outliers
|
||||
|
||||
self.post_json('/alarms', params=json, status=201,
|
||||
headers=self.auth_headers)
|
||||
alarms = list(self.conn.get_alarms(enabled=False))
|
||||
@ -453,16 +474,16 @@ class TestAlarms(FunctionalTest,
|
||||
# to check to BoundedInt type conversion
|
||||
json['threshold_rule']['evaluation_periods'] = 3
|
||||
json['threshold_rule']['period'] = 180
|
||||
if alarms[0].name == 'added_alarm':
|
||||
for key in json:
|
||||
if key.endswith('_rule'):
|
||||
storage_key = 'rule'
|
||||
else:
|
||||
storage_key = key
|
||||
self.assertEqual(getattr(alarms[0], storage_key),
|
||||
json[key])
|
||||
else:
|
||||
self.fail("Alarm not found")
|
||||
self._verify_alarm(json, alarms[0], 'added_alarm')
|
||||
|
||||
def test_post_alarm_outlier_exclusion_set(self):
    # post an alarm whose threshold rule enables outlier exclusion
    self._do_test_post_alarm(exclude_outliers=True)
|
||||
|
||||
def test_post_alarm_outlier_exclusion_clear(self):
    # post an alarm whose threshold rule disables outlier exclusion
    self._do_test_post_alarm(exclude_outliers=False)
|
||||
|
||||
def test_post_alarm_outlier_exclusion_defaulted(self):
    # post an alarm without 'exclude_outliers' in its threshold rule
    self._do_test_post_alarm(exclude_outliers=None)
|
||||
|
||||
def _do_test_post_alarm_as_admin(self, explicit_project_constraint):
|
||||
"""Test the creation of an alarm as admin for another project."""
|
||||
@ -499,6 +520,7 @@ class TestAlarms(FunctionalTest,
|
||||
self.assertEqual(1, len(alarms))
|
||||
self.assertEqual(alarms[0].user_id, 'auseridthatisnotmine')
|
||||
self.assertEqual(alarms[0].project_id, 'aprojectidthatisnotmine')
|
||||
self._add_default_threshold_rule(json)
|
||||
if alarms[0].name == 'added_alarm':
|
||||
for key in json:
|
||||
if key.endswith('_rule'):
|
||||
@ -566,16 +588,7 @@ class TestAlarms(FunctionalTest,
|
||||
self.assertEqual(1, len(alarms))
|
||||
self.assertEqual(alarms[0].user_id, self.auth_headers['X-User-Id'])
|
||||
self.assertEqual(alarms[0].project_id, 'aprojectidthatisnotmine')
|
||||
if alarms[0].name == 'added_alarm':
|
||||
for key in json:
|
||||
if key.endswith('_rule'):
|
||||
storage_key = 'rule'
|
||||
else:
|
||||
storage_key = key
|
||||
self.assertEqual(getattr(alarms[0], storage_key),
|
||||
json[key])
|
||||
else:
|
||||
self.fail("Alarm not found")
|
||||
self._verify_alarm(json, alarms[0], 'added_alarm')
|
||||
|
||||
def test_post_alarm_as_admin_no_project(self):
|
||||
"""Test the creation of an alarm as admin for another project but
|
||||
@ -612,16 +625,7 @@ class TestAlarms(FunctionalTest,
|
||||
self.assertEqual(alarms[0].user_id, 'auseridthatisnotmine')
|
||||
self.assertEqual(alarms[0].project_id,
|
||||
self.auth_headers['X-Project-Id'])
|
||||
if alarms[0].name == 'added_alarm':
|
||||
for key in json:
|
||||
if key.endswith('_rule'):
|
||||
storage_key = 'rule'
|
||||
else:
|
||||
storage_key = key
|
||||
self.assertEqual(getattr(alarms[0], storage_key),
|
||||
json[key])
|
||||
else:
|
||||
self.fail("Alarm not found")
|
||||
self._verify_alarm(json, alarms[0], 'added_alarm')
|
||||
|
||||
def test_post_alarm_combination(self):
|
||||
json = {
|
||||
@ -818,12 +822,7 @@ class TestAlarms(FunctionalTest,
|
||||
json['threshold_rule']['query'].append({
|
||||
'field': 'project_id', 'op': 'eq',
|
||||
'value': self.auth_headers['X-Project-Id']})
|
||||
for key in json:
|
||||
if key.endswith('_rule'):
|
||||
storage_key = 'rule'
|
||||
else:
|
||||
storage_key = key
|
||||
self.assertEqual(getattr(alarm, storage_key), json[key])
|
||||
self._verify_alarm(json, alarm)
|
||||
|
||||
def test_put_alarm_as_admin(self):
|
||||
json = {
|
||||
@ -870,12 +869,7 @@ class TestAlarms(FunctionalTest,
|
||||
alarm = list(self.conn.get_alarms(alarm_id=alarm_id, enabled=False))[0]
|
||||
self.assertEqual(alarm.user_id, 'myuserid')
|
||||
self.assertEqual(alarm.project_id, 'myprojectid')
|
||||
for key in json:
|
||||
if key.endswith('_rule'):
|
||||
storage_key = 'rule'
|
||||
else:
|
||||
storage_key = key
|
||||
self.assertEqual(getattr(alarm, storage_key), json[key])
|
||||
self._verify_alarm(json, alarm)
|
||||
|
||||
def test_put_alarm_wrong_field(self):
|
||||
# Note: wsme will ignore unknown fields so will just not appear in
|
||||
@ -1046,6 +1040,7 @@ class TestAlarms(FunctionalTest,
|
||||
type='creation',
|
||||
user_id=alarm['user_id']),
|
||||
history[0])
|
||||
self._add_default_threshold_rule(new_alarm)
|
||||
new_alarm['rule'] = new_alarm['threshold_rule']
|
||||
del new_alarm['threshold_rule']
|
||||
new_alarm['rule']['query'].append({
|
||||
@ -1118,6 +1113,7 @@ class TestAlarms(FunctionalTest,
|
||||
data = dict(state='alarm')
|
||||
self._update_alarm(alarm, data, auth_headers=admin_auth)
|
||||
|
||||
self._add_default_threshold_rule(new_alarm)
|
||||
new_alarm['rule'] = new_alarm['threshold_rule']
|
||||
del new_alarm['threshold_rule']
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user