Merge "Exclude weak datapoints from alarm threshold evaluation"

This commit is contained in:
Jenkins 2014-01-24 02:26:44 +00:00 committed by Gerrit Code Review
commit 447573f8c9
5 changed files with 240 additions and 61 deletions

View File

@ -21,6 +21,7 @@ import datetime
import operator
from ceilometer.alarm import evaluator
from ceilometer.alarm.evaluator import utils
from ceilometer.openstack.common.gettextutils import _ # noqa
from ceilometer.openstack.common import log
from ceilometer.openstack.common import timeutils
@ -51,8 +52,13 @@ class ThresholdEvaluator(evaluator.Evaluator):
def _bound_duration(cls, alarm, constraints):
"""Bound the duration of the statistics query."""
now = timeutils.utcnow()
# when exclusion of weak datapoints is enabled, we extend
# the look-back period so as to allow a clearer sample count
# trend to be established
look_back = (cls.look_back if not alarm.rule.get('exclude_outliers')
else alarm.rule['evaluation_periods'])
window = (alarm.rule['period'] *
(alarm.rule['evaluation_periods'] + cls.look_back))
(alarm.rule['evaluation_periods'] + look_back))
start = now - datetime.timedelta(seconds=window)
LOG.debug(_('query stats from %(start)s to '
'%(now)s') % {'start': start, 'now': now})
@ -64,13 +70,25 @@ class ThresholdEvaluator(evaluator.Evaluator):
@staticmethod
def _sanitize(alarm, statistics):
"""Sanitize statistics.
Ultimately this will be the hook for the exclusion of chaotic
datapoints for example.
"""
LOG.debug(_('sanitize stats %s') % statistics)
if alarm.rule.get('exclude_outliers'):
key = operator.attrgetter('count')
mean = utils.mean(statistics, key)
stddev = utils.stddev(statistics, key, mean)
lower = mean - 2 * stddev
upper = mean + 2 * stddev
inliers, outliers = utils.anomolies(statistics, key, lower, upper)
if outliers:
LOG.debug(_('excluded weak datapoints with sample counts %s'),
[s.count for s in outliers])
statistics = inliers
else:
LOG.debug('no excluded weak datapoints')
# in practice statistics are always sorted by period start, not
# strictly required by the API though
statistics = statistics[:alarm.rule['evaluation_periods']]
statistics = statistics[-alarm.rule['evaluation_periods']:]
LOG.debug(_('pruned statistics to %d') % len(statistics))
return statistics

View File

@ -0,0 +1,67 @@
# -*- encoding: utf-8 -*-
#
# Copyright © 2014 Red Hat, Inc
#
# Author: Eoghan Glynn <eglynn@redhat.com>
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import math
def mean(s, key=lambda x: x):
    """Return the arithmetic mean of ``key(item)`` over the sequence *s*.

    :param s: sized sequence of datapoints
    :param key: callable extracting the numeric value from a datapoint
    :returns: the mean as a float, or 0.0 when *s* is empty
    """
    size = float(len(s))
    # guard against division by zero for an empty sequence
    if not size:
        return 0.0
    total = math.fsum(key(item) for item in s)
    return total / size
def deltas(s, key, m=None):
    """Calculate the squared distances from the mean for a numeric list.

    :param s: sequence of datapoints
    :param key: callable extracting the numeric value from a datapoint
    :param m: precomputed mean; when None it is derived from *s* via mean()
    :returns: list of squared distances, one per element of *s*
    """
    # test against None explicitly: a legitimate mean of zero is falsy and
    # must not be silently replaced by a recomputed value
    if m is None:
        m = mean(s, key)
    return [(key(i) - m) ** 2 for i in s]
def variance(s, key, m=None):
    """Return the mean of the squared distances from the mean over *s*.

    :param s: sequence of datapoints
    :param key: callable extracting the numeric value from a datapoint
    :param m: optional precomputed mean, forwarded to deltas()
    :returns: the averaged squared distances as computed by mean()
    """
    squared_distances = deltas(s, key, m)
    return mean(squared_distances)
def stddev(s, key, m=None):
    """Return the standard deviation of ``key(item)`` over *s*.

    :param s: sequence of datapoints
    :param key: callable extracting the numeric value from a datapoint
    :param m: optional precomputed mean, forwarded to variance()
    :returns: square root of the variance
    """
    spread = variance(s, key, m)
    return math.sqrt(spread)
def outside(s, key, lower=0.0, upper=0.0):
    """Tell whether ``key(s)`` lies strictly below *lower* or above *upper*.

    :param s: a single datapoint
    :param key: callable extracting the numeric value from the datapoint
    :param lower: lower bound; a value equal to it is not outside
    :param upper: upper bound; a value equal to it is not outside
    :returns: truthy when the value is beyond either bound
    """
    value = key(s)
    too_low = value < lower
    return too_low or value > upper
def anomolies(s, key, lower=0.0, upper=0.0):
    """Partition datapoints into those within and those beyond the bounds.

    :param s: iterable of datapoints
    :param key: callable extracting the numeric value from a datapoint
    :param lower: lower bound passed to outside()
    :param upper: upper bound passed to outside()
    :returns: tuple ``(inliers, outliers)`` of lists, each preserving the
              original iteration order
    """
    inside_bounds, beyond_bounds = [], []
    for datapoint in s:
        is_outlier = outside(datapoint, key, lower, upper)
        bucket = beyond_bounds if is_outlier else inside_bounds
        bucket.append(datapoint)
    return inside_bounds, beyond_bounds

View File

@ -1111,6 +1111,9 @@ class AlarmThresholdRule(_Base):
evaluation_periods = wsme.wsattr(BoundedInt(min=1), default=1)
"The number of historical periods to evaluate the threshold"
exclude_outliers = wsme.wsattr(bool, default=False)
"Whether datapoints with anomolously low sample counts are excluded"
def __init__(self, query=None, **kwargs):
if query:
query = [Query(**q) for q in query]
@ -1143,7 +1146,8 @@ class AlarmThresholdRule(_Base):
def as_dict(self):
rule = self.as_dict_from_keys(['period', 'comparison_operator',
'threshold', 'statistic',
'evaluation_periods', 'meter_name'])
'evaluation_periods', 'meter_name',
'exclude_outliers'])
rule['query'] = [q.as_dict() for q in self.query]
return rule

View File

@ -96,8 +96,12 @@ class TestEvaluate(base.TestEvaluatorBase):
]
@staticmethod
def _get_stat(attr, value):
return statistics.Statistics(None, {attr: value})
def _get_stat(attr, value, count=1):
return statistics.Statistics(None, {attr: value, 'count': count})
def _set_all_rules(self, field, value):
for alarm in self.alarms:
alarm.rule[field] = value
def test_retry_transient_api_failure(self):
with mock.patch('ceilometerclient.client.get_client',
@ -150,9 +154,9 @@ class TestEvaluate(base.TestEvaluatorBase):
update_calls = self.api_client.alarms.set_state.call_args_list
self.assertEqual(update_calls, expected)
reasons = ['Transition to alarm due to 5 samples outside'
' threshold, most recent: 85.0',
' threshold, most recent: %s' % avgs[-1].avg,
'Transition to alarm due to 4 samples outside'
' threshold, most recent: 7.0']
' threshold, most recent: %s' % maxs[-1].max]
expected = [mock.call(alarm, 'ok', reason)
for alarm, reason in zip(self.alarms, reasons)]
self.assertEqual(self.notifier.notify.call_args_list, expected)
@ -173,9 +177,9 @@ class TestEvaluate(base.TestEvaluatorBase):
update_calls = self.api_client.alarms.set_state.call_args_list
self.assertEqual(update_calls, expected)
reasons = ['Transition to ok due to 5 samples inside'
' threshold, most recent: 76.0',
' threshold, most recent: %s' % avgs[-1].avg,
'Transition to ok due to 4 samples inside'
' threshold, most recent: 14.0']
' threshold, most recent: %s' % maxs[-1].max]
expected = [mock.call(alarm, 'alarm', reason)
for alarm, reason in zip(self.alarms, reasons)]
self.assertEqual(self.notifier.notify.call_args_list, expected)
@ -251,9 +255,9 @@ class TestEvaluate(base.TestEvaluatorBase):
update_calls = self.api_client.alarms.set_state.call_args_list
self.assertEqual(update_calls, expected)
reasons = ['Transition to alarm due to 5 samples outside'
' threshold, most recent: 85.0',
' threshold, most recent: %s' % avgs[-1].avg,
'Transition to alarm due to 4 samples outside'
' threshold, most recent: 7.0']
' threshold, most recent: %s' % maxs[-1].max]
expected = [mock.call(alarm, 'ok', reason)
for alarm, reason in zip(self.alarms, reasons)]
self.assertEqual(self.notifier.notify.call_args_list, expected)
@ -274,25 +278,37 @@ class TestEvaluate(base.TestEvaluatorBase):
update_calls = self.api_client.alarms.set_state.call_args_list
self.assertEqual(update_calls, expected)
reasons = ['Transition to alarm due to 5 samples outside'
' threshold, most recent: 85.0',
' threshold, most recent: %s' % avgs[-1].avg,
'Transition to alarm due to 4 samples outside'
' threshold, most recent: 7.0']
' threshold, most recent: %s' % maxs[-1].max]
expected = [mock.call(alarm, 'insufficient data', reason)
for alarm, reason in zip(self.alarms, reasons)]
self.assertEqual(self.notifier.notify.call_args_list, expected)
def test_bound_duration(self):
def _do_test_bound_duration(self, start, exclude_outliers=None):
alarm = self.alarms[0]
if exclude_outliers is not None:
alarm.rule['exclude_outliers'] = exclude_outliers
timeutils.utcnow.override_time = datetime.datetime(2012, 7, 2, 10, 45)
constraint = self.evaluator._bound_duration(self.alarms[0], [])
constraint = self.evaluator._bound_duration(alarm, [])
self.assertEqual(constraint, [
{'field': 'timestamp',
'op': 'le',
'value': timeutils.utcnow().isoformat()},
{'field': 'timestamp',
'op': 'ge',
'value': '2012-07-02T10:39:00'},
'value': start},
])
def test_bound_duration_outlier_exclusion_defaulted(self):
self._do_test_bound_duration('2012-07-02T10:39:00')
def test_bound_duration_outlier_exclusion_clear(self):
self._do_test_bound_duration('2012-07-02T10:39:00', False)
def test_bound_duration_outlier_exclusion_set(self):
self._do_test_bound_duration('2012-07-02T10:35:00', True)
def test_threshold_endpoint_types(self):
endpoint_types = ["internalURL", "publicURL"]
for endpoint_type in endpoint_types:
@ -313,3 +329,81 @@ class TestEvaluate(base.TestEvaluatorBase):
os_endpoint_type=conf.os_endpoint_type)]
actual = client.call_args_list
self.assertEqual(actual, expected)
def _do_test_simple_alarm_trip_outlier_exclusion(self, exclude_outliers):
self._set_all_rules('exclude_outliers', exclude_outliers)
self._set_all_alarms('ok')
with mock.patch('ceilometerclient.client.get_client',
return_value=self.api_client):
# most recent datapoints inside threshold but with
# anomalously low sample count
threshold = self.alarms[0].rule['threshold']
avgs = [self._get_stat('avg',
threshold + (v if v < 10 else -v),
count=20 if v < 10 else 1)
for v in xrange(1, 11)]
threshold = self.alarms[1].rule['threshold']
maxs = [self._get_stat('max',
threshold - (v if v < 7 else -v),
count=20 if v < 7 else 1)
for v in xrange(8)]
self.api_client.statistics.list.side_effect = [avgs, maxs]
self._evaluate_all_alarms()
self._assert_all_alarms('alarm' if exclude_outliers else 'ok')
if exclude_outliers:
expected = [mock.call(alarm.alarm_id, state='alarm')
for alarm in self.alarms]
update_calls = self.api_client.alarms.set_state.call_args_list
self.assertEqual(update_calls, expected)
reasons = ['Transition to alarm due to 5 samples outside'
' threshold, most recent: %s' % avgs[-2].avg,
'Transition to alarm due to 4 samples outside'
' threshold, most recent: %s' % maxs[-2].max]
expected = [mock.call(alarm, 'ok', reason)
for alarm, reason in zip(self.alarms, reasons)]
self.assertEqual(self.notifier.notify.call_args_list, expected)
def test_simple_alarm_trip_with_outlier_exclusion(self):
self. _do_test_simple_alarm_trip_outlier_exclusion(True)
def test_simple_alarm_no_trip_without_outlier_exclusion(self):
self. _do_test_simple_alarm_trip_outlier_exclusion(False)
def _do_test_simple_alarm_clear_outlier_exclusion(self, exclude_outliers):
self._set_all_rules('exclude_outliers', exclude_outliers)
self._set_all_alarms('alarm')
with mock.patch('ceilometerclient.client.get_client',
return_value=self.api_client):
# most recent datapoints outside threshold but with
# anomalously low sample count
threshold = self.alarms[0].rule['threshold']
avgs = [self._get_stat('avg',
threshold - (v if v < 9 else -v),
count=20 if v < 9 else 1)
for v in xrange(10)]
threshold = self.alarms[1].rule['threshold']
maxs = [self._get_stat('max',
threshold + (v if v < 8 else -v),
count=20 if v < 8 else 1)
for v in xrange(1, 9)]
self.api_client.statistics.list.side_effect = [avgs, maxs]
self._evaluate_all_alarms()
self._assert_all_alarms('ok' if exclude_outliers else 'alarm')
if exclude_outliers:
expected = [mock.call(alarm.alarm_id, state='ok')
for alarm in self.alarms]
update_calls = self.api_client.alarms.set_state.call_args_list
self.assertEqual(update_calls, expected)
reasons = ['Transition to ok due to 5 samples inside'
' threshold, most recent: %s' % avgs[-2].avg,
'Transition to ok due to 4 samples inside'
' threshold, most recent: %s' % maxs[-2].max]
expected = [mock.call(alarm, 'alarm', reason)
for alarm, reason in zip(self.alarms, reasons)]
self.assertEqual(self.notifier.notify.call_args_list, expected)
def test_simple_alarm_clear_with_outlier_exclusion(self):
self. _do_test_simple_alarm_clear_outlier_exclusion(True)
def test_simple_alarm_no_clear_without_outlier_exclusion(self):
self. _do_test_simple_alarm_clear_outlier_exclusion(False)

View File

@ -150,6 +150,23 @@ class TestAlarms(FunctionalTest,
)]:
self.conn.update_alarm(alarm)
@staticmethod
def _add_default_threshold_rule(alarm):
if 'exclude_outliers' not in alarm['threshold_rule']:
alarm['threshold_rule']['exclude_outliers'] = False
def _verify_alarm(self, json, alarm, expected_name=None):
if expected_name and alarm.name != expected_name:
self.fail("Alarm not found")
self._add_default_threshold_rule(json)
for key in json:
if key.endswith('_rule'):
storage_key = 'rule'
else:
storage_key = key
self.assertEqual(getattr(alarm, storage_key),
json[key])
def test_list_alarms(self):
data = self.get_json('/alarms')
self.assertEqual(4, len(data))
@ -394,6 +411,7 @@ class TestAlarms(FunctionalTest,
}
}
self._add_default_threshold_rule(to_check)
json = {
'name': 'added_alarm_defaults',
@ -420,7 +438,7 @@ class TestAlarms(FunctionalTest,
else:
self.fail("Alarm not found")
def test_post_alarm(self):
def _do_test_post_alarm(self, exclude_outliers=None):
json = {
'enabled': False,
'name': 'added_alarm',
@ -443,6 +461,9 @@ class TestAlarms(FunctionalTest,
'period': '180',
}
}
if exclude_outliers is not None:
json['threshold_rule']['exclude_outliers'] = exclude_outliers
self.post_json('/alarms', params=json, status=201,
headers=self.auth_headers)
alarms = list(self.conn.get_alarms(enabled=False))
@ -453,16 +474,16 @@ class TestAlarms(FunctionalTest,
# to check to BoundedInt type conversion
json['threshold_rule']['evaluation_periods'] = 3
json['threshold_rule']['period'] = 180
if alarms[0].name == 'added_alarm':
for key in json:
if key.endswith('_rule'):
storage_key = 'rule'
else:
storage_key = key
self.assertEqual(getattr(alarms[0], storage_key),
json[key])
else:
self.fail("Alarm not found")
self._verify_alarm(json, alarms[0], 'added_alarm')
def test_post_alarm_outlier_exclusion_set(self):
self._do_test_post_alarm(True)
def test_post_alarm_outlier_exclusion_clear(self):
self._do_test_post_alarm(False)
def test_post_alarm_outlier_exclusion_defaulted(self):
self._do_test_post_alarm()
def _do_test_post_alarm_as_admin(self, explicit_project_constraint):
"""Test the creation of an alarm as admin for another project."""
@ -499,6 +520,7 @@ class TestAlarms(FunctionalTest,
self.assertEqual(1, len(alarms))
self.assertEqual(alarms[0].user_id, 'auseridthatisnotmine')
self.assertEqual(alarms[0].project_id, 'aprojectidthatisnotmine')
self._add_default_threshold_rule(json)
if alarms[0].name == 'added_alarm':
for key in json:
if key.endswith('_rule'):
@ -566,16 +588,7 @@ class TestAlarms(FunctionalTest,
self.assertEqual(1, len(alarms))
self.assertEqual(alarms[0].user_id, self.auth_headers['X-User-Id'])
self.assertEqual(alarms[0].project_id, 'aprojectidthatisnotmine')
if alarms[0].name == 'added_alarm':
for key in json:
if key.endswith('_rule'):
storage_key = 'rule'
else:
storage_key = key
self.assertEqual(getattr(alarms[0], storage_key),
json[key])
else:
self.fail("Alarm not found")
self._verify_alarm(json, alarms[0], 'added_alarm')
def test_post_alarm_as_admin_no_project(self):
"""Test the creation of an alarm as admin for another project but
@ -612,16 +625,7 @@ class TestAlarms(FunctionalTest,
self.assertEqual(alarms[0].user_id, 'auseridthatisnotmine')
self.assertEqual(alarms[0].project_id,
self.auth_headers['X-Project-Id'])
if alarms[0].name == 'added_alarm':
for key in json:
if key.endswith('_rule'):
storage_key = 'rule'
else:
storage_key = key
self.assertEqual(getattr(alarms[0], storage_key),
json[key])
else:
self.fail("Alarm not found")
self._verify_alarm(json, alarms[0], 'added_alarm')
def test_post_alarm_combination(self):
json = {
@ -818,12 +822,7 @@ class TestAlarms(FunctionalTest,
json['threshold_rule']['query'].append({
'field': 'project_id', 'op': 'eq',
'value': self.auth_headers['X-Project-Id']})
for key in json:
if key.endswith('_rule'):
storage_key = 'rule'
else:
storage_key = key
self.assertEqual(getattr(alarm, storage_key), json[key])
self._verify_alarm(json, alarm)
def test_put_alarm_as_admin(self):
json = {
@ -870,12 +869,7 @@ class TestAlarms(FunctionalTest,
alarm = list(self.conn.get_alarms(alarm_id=alarm_id, enabled=False))[0]
self.assertEqual(alarm.user_id, 'myuserid')
self.assertEqual(alarm.project_id, 'myprojectid')
for key in json:
if key.endswith('_rule'):
storage_key = 'rule'
else:
storage_key = key
self.assertEqual(getattr(alarm, storage_key), json[key])
self._verify_alarm(json, alarm)
def test_put_alarm_wrong_field(self):
# Note: wsme will ignore unknown fields so will just not appear in
@ -1046,6 +1040,7 @@ class TestAlarms(FunctionalTest,
type='creation',
user_id=alarm['user_id']),
history[0])
self._add_default_threshold_rule(new_alarm)
new_alarm['rule'] = new_alarm['threshold_rule']
del new_alarm['threshold_rule']
new_alarm['rule']['query'].append({
@ -1118,6 +1113,7 @@ class TestAlarms(FunctionalTest,
data = dict(state='alarm')
self._update_alarm(alarm, data, auth_headers=admin_auth)
self._add_default_threshold_rule(new_alarm)
new_alarm['rule'] = new_alarm['threshold_rule']
del new_alarm['threshold_rule']