Merge "Exclude weak datapoints from alarm threshold evaluation"

2014-01-24 02:26:44 +00:00 · 2014-01-24 02:26:44 +00:00 · 447573f8c9
commit 447573f8c9
parent 72857ef4f9 a4c7411ac9
5 changed files with 240 additions and 61 deletions
--- a/ceilometer/alarm/evaluator/threshold.py
+++ b/ceilometer/alarm/evaluator/threshold.py
@ -21,6 +21,7 @@ import datetime
 import operator

 from ceilometer.alarm import evaluator
+from ceilometer.alarm.evaluator import utils
 from ceilometer.openstack.common.gettextutils import _  # noqa
 from ceilometer.openstack.common import log
 from ceilometer.openstack.common import timeutils
@ -51,8 +52,13 @@ class ThresholdEvaluator(evaluator.Evaluator):
    def _bound_duration(cls, alarm, constraints):
        """Bound the duration of the statistics query."""
        now = timeutils.utcnow()
+        # when exclusion of weak datapoints is enabled, we extend
+        # the look-back period so as to allow a clearer sample count
+        # trend to be established
+        look_back = (cls.look_back if not alarm.rule.get('exclude_outliers')
+                     else alarm.rule['evaluation_periods'])
        window = (alarm.rule['period'] *
-                  (alarm.rule['evaluation_periods'] + cls.look_back))
+                  (alarm.rule['evaluation_periods'] + look_back))
        start = now - datetime.timedelta(seconds=window)
        LOG.debug(_('query stats from %(start)s to '
                    '%(now)s') % {'start': start, 'now': now})
@ -64,13 +70,25 @@ class ThresholdEvaluator(evaluator.Evaluator):
    @staticmethod
    def _sanitize(alarm, statistics):
        """Sanitize statistics.
-           Ultimately this will be the hook for the exclusion of chaotic
-           datapoints for example.
        """
        LOG.debug(_('sanitize stats %s') % statistics)
+        if alarm.rule.get('exclude_outliers'):
+            key = operator.attrgetter('count')
+            mean = utils.mean(statistics, key)
+            stddev = utils.stddev(statistics, key, mean)
+            lower = mean - 2 * stddev
+            upper = mean + 2 * stddev
+            inliers, outliers = utils.anomolies(statistics, key, lower, upper)
+            if outliers:
+                LOG.debug(_('excluded weak datapoints with sample counts %s'),
+                          [s.count for s in outliers])
+                statistics = inliers
+            else:
+                LOG.debug('no excluded weak datapoints')
+
        # in practice statistics are always sorted by period start, not
        # strictly required by the API though
-        statistics = statistics[:alarm.rule['evaluation_periods']]
+        statistics = statistics[-alarm.rule['evaluation_periods']:]
        LOG.debug(_('pruned statistics to %d') % len(statistics))
        return statistics

--- a/ceilometer/alarm/evaluator/utils.py
+++ b/ceilometer/alarm/evaluator/utils.py
@ -0,0 +1,67 @@
+# -*- encoding: utf-8 -*-
+#
+# Copyright © 2014 Red Hat, Inc
+#
+# Author: Eoghan Glynn <eglynn@redhat.com>
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import math
+
+
+def mean(s, key=lambda x: x):
+    """Return the arithmetic mean of ``key(item)`` over the sequence ``s``.
+
+    An empty sequence yields 0.0 instead of dividing by zero.
+    """
+    size = len(s)
+    if size == 0:
+        return 0.0
+    total = math.fsum(key(item) for item in s)
+    return total / float(size)
+
+
+def deltas(s, key, m=None):
+    """Calculate the squared distances from the mean for a numeric list.
+
+    :param s: sequence of items
+    :param key: callable extracting the numeric value from each item
+    :param m: precomputed mean of the keyed values; computed from ``s``
+              when None
+    :returns: list of squared deviations, in the same order as ``s``
+    """
+    # compare against None explicitly so that a mean of 0 (or 0.0)
+    # supplied by the caller is honoured rather than being treated as
+    # "not provided" and recomputed
+    if m is None:
+        m = mean(s, key)
+    return [(key(i) - m) ** 2 for i in s]
+
+
+def variance(s, key, m=None):
+    """Return the variance of the keyed values in ``s``.
+
+    Computed as the mean of the squared distances from the mean ``m``
+    (which ``deltas`` derives from ``s`` when it is not supplied).
+    """
+    squared_distances = deltas(s, key, m)
+    return mean(squared_distances)
+
+
+def stddev(s, key, m=None):
+    """Return the standard deviation of the keyed values in ``s``.
+
+    This is the square root of ``variance(s, key, m)``.
+    """
+    spread = variance(s, key, m)
+    return math.sqrt(spread)
+
+
+def outside(s, key, lower=0.0, upper=0.0):
+    """Report whether ``key(s)`` lies strictly below ``lower`` or strictly
+    above ``upper``; values equal to either bound count as inside.
+    """
+    value = key(s)
+    too_low = value < lower
+    return too_low or value > upper
+
+
+def anomolies(s, key, lower=0.0, upper=0.0):
+    """Partition ``s`` into inliers and outliers.
+
+    An item is an outlier when ``outside`` reports its keyed value as
+    falling beyond the ``lower``/``upper`` bounds. Relative order is kept
+    within each partition.
+
+    :returns: an ``(inliers, outliers)`` tuple of lists
+    """
+    inliers, outliers = [], []
+    for item in s:
+        bucket = outliers if outside(item, key, lower, upper) else inliers
+        bucket.append(item)
+    return inliers, outliers
--- a/ceilometer/api/controllers/v2.py
+++ b/ceilometer/api/controllers/v2.py
@ -1111,6 +1111,9 @@ class AlarmThresholdRule(_Base):
    evaluation_periods = wsme.wsattr(BoundedInt(min=1), default=1)
    "The number of historical periods to evaluate the threshold"

+    exclude_outliers = wsme.wsattr(bool, default=False)
+    "Whether datapoints with anomolously low sample counts are excluded"
+
    def __init__(self, query=None, **kwargs):
        if query:
            query = [Query(**q) for q in query]
@ -1143,7 +1146,8 @@ class AlarmThresholdRule(_Base):
    def as_dict(self):
        rule = self.as_dict_from_keys(['period', 'comparison_operator',
                                       'threshold', 'statistic',
-                                       'evaluation_periods', 'meter_name'])
+                                       'evaluation_periods', 'meter_name',
+                                       'exclude_outliers'])
        rule['query'] = [q.as_dict() for q in self.query]
        return rule

--- a/ceilometer/tests/alarm/evaluator/test_threshold.py
+++ b/ceilometer/tests/alarm/evaluator/test_threshold.py
@ -96,8 +96,12 @@ class TestEvaluate(base.TestEvaluatorBase):
        ]

    @staticmethod
-    def _get_stat(attr, value):
-        return statistics.Statistics(None, {attr: value})
+    def _get_stat(attr, value, count=1):
+        return statistics.Statistics(None, {attr: value, 'count': count})
+
+    def _set_all_rules(self, field, value):
+        for alarm in self.alarms:
+            alarm.rule[field] = value

    def test_retry_transient_api_failure(self):
        with mock.patch('ceilometerclient.client.get_client',
@ -150,9 +154,9 @@ class TestEvaluate(base.TestEvaluatorBase):
            update_calls = self.api_client.alarms.set_state.call_args_list
            self.assertEqual(update_calls, expected)
            reasons = ['Transition to alarm due to 5 samples outside'
-                       ' threshold, most recent: 85.0',
+                       ' threshold, most recent: %s' % avgs[-1].avg,
                       'Transition to alarm due to 4 samples outside'
-                       ' threshold, most recent: 7.0']
+                       ' threshold, most recent: %s' % maxs[-1].max]
            expected = [mock.call(alarm, 'ok', reason)
                        for alarm, reason in zip(self.alarms, reasons)]
            self.assertEqual(self.notifier.notify.call_args_list, expected)
@ -173,9 +177,9 @@ class TestEvaluate(base.TestEvaluatorBase):
            update_calls = self.api_client.alarms.set_state.call_args_list
            self.assertEqual(update_calls, expected)
            reasons = ['Transition to ok due to 5 samples inside'
-                       ' threshold, most recent: 76.0',
+                       ' threshold, most recent: %s' % avgs[-1].avg,
                       'Transition to ok due to 4 samples inside'
-                       ' threshold, most recent: 14.0']
+                       ' threshold, most recent: %s' % maxs[-1].max]
            expected = [mock.call(alarm, 'alarm', reason)
                        for alarm, reason in zip(self.alarms, reasons)]
            self.assertEqual(self.notifier.notify.call_args_list, expected)
@ -251,9 +255,9 @@ class TestEvaluate(base.TestEvaluatorBase):
            update_calls = self.api_client.alarms.set_state.call_args_list
            self.assertEqual(update_calls, expected)
            reasons = ['Transition to alarm due to 5 samples outside'
-                       ' threshold, most recent: 85.0',
+                       ' threshold, most recent: %s' % avgs[-1].avg,
                       'Transition to alarm due to 4 samples outside'
-                       ' threshold, most recent: 7.0']
+                       ' threshold, most recent: %s' % maxs[-1].max]
            expected = [mock.call(alarm, 'ok', reason)
                        for alarm, reason in zip(self.alarms, reasons)]
            self.assertEqual(self.notifier.notify.call_args_list, expected)
@ -274,25 +278,37 @@ class TestEvaluate(base.TestEvaluatorBase):
            update_calls = self.api_client.alarms.set_state.call_args_list
            self.assertEqual(update_calls, expected)
            reasons = ['Transition to alarm due to 5 samples outside'
-                       ' threshold, most recent: 85.0',
+                       ' threshold, most recent: %s' % avgs[-1].avg,
                       'Transition to alarm due to 4 samples outside'
-                       ' threshold, most recent: 7.0']
+                       ' threshold, most recent: %s' % maxs[-1].max]
            expected = [mock.call(alarm, 'insufficient data', reason)
                        for alarm, reason in zip(self.alarms, reasons)]
            self.assertEqual(self.notifier.notify.call_args_list, expected)

-    def test_bound_duration(self):
+    def _do_test_bound_duration(self, start, exclude_outliers=None):
+        alarm = self.alarms[0]
+        if exclude_outliers is not None:
+            alarm.rule['exclude_outliers'] = exclude_outliers
        timeutils.utcnow.override_time = datetime.datetime(2012, 7, 2, 10, 45)
-        constraint = self.evaluator._bound_duration(self.alarms[0], [])
+        constraint = self.evaluator._bound_duration(alarm, [])
        self.assertEqual(constraint, [
            {'field': 'timestamp',
             'op': 'le',
             'value': timeutils.utcnow().isoformat()},
            {'field': 'timestamp',
             'op': 'ge',
-             'value': '2012-07-02T10:39:00'},
+             'value': start},
        ])

+    def test_bound_duration_outlier_exclusion_defaulted(self):
+        self._do_test_bound_duration('2012-07-02T10:39:00')
+
+    def test_bound_duration_outlier_exclusion_clear(self):
+        self._do_test_bound_duration('2012-07-02T10:39:00', False)
+
+    def test_bound_duration_outlier_exclusion_set(self):
+        self._do_test_bound_duration('2012-07-02T10:35:00', True)
+
    def test_threshold_endpoint_types(self):
        endpoint_types = ["internalURL", "publicURL"]
        for endpoint_type in endpoint_types:
@ -313,3 +329,81 @@ class TestEvaluate(base.TestEvaluatorBase):
                                      os_endpoint_type=conf.os_endpoint_type)]
                actual = client.call_args_list
                self.assertEqual(actual, expected)
+
+    def _do_test_simple_alarm_trip_outlier_exclusion(self, exclude_outliers):
+        # Shared driver: start every alarm in 'ok', feed statistics whose
+        # final datapoint has a much lower sample count than the rest, and
+        # check that the alarms only trip when exclude_outliers is set.
+        self._set_all_rules('exclude_outliers', exclude_outliers)
+        self._set_all_alarms('ok')
+        with mock.patch('ceilometerclient.client.get_client',
+                        return_value=self.api_client):
+            # most recent datapoints inside threshold but with
+            # anomalously low sample count
+            threshold = self.alarms[0].rule['threshold']
+            # nine datapoints above the threshold (count=20) followed by a
+            # single one below it (count=1)
+            avgs = [self._get_stat('avg',
+                                   threshold + (v if v < 10 else -v),
+                                   count=20 if v < 10 else 1)
+                    for v in xrange(1, 11)]
+            threshold = self.alarms[1].rule['threshold']
+            # seven datapoints at or below the threshold (count=20) followed
+            # by a single one above it (count=1)
+            maxs = [self._get_stat('max',
+                                   threshold - (v if v < 7 else -v),
+                                   count=20 if v < 7 else 1)
+                    for v in xrange(8)]
+            self.api_client.statistics.list.side_effect = [avgs, maxs]
+            self._evaluate_all_alarms()
+            self._assert_all_alarms('alarm' if exclude_outliers else 'ok')
+            if exclude_outliers:
+                expected = [mock.call(alarm.alarm_id, state='alarm')
+                            for alarm in self.alarms]
+                update_calls = self.api_client.alarms.set_state.call_args_list
+                self.assertEqual(update_calls, expected)
+                # the low-count final datapoint is expected to be dropped,
+                # so the reported "most recent" value is the second-to-last
+                reasons = ['Transition to alarm due to 5 samples outside'
+                           ' threshold, most recent: %s' % avgs[-2].avg,
+                           'Transition to alarm due to 4 samples outside'
+                           ' threshold, most recent: %s' % maxs[-2].max]
+                expected = [mock.call(alarm, 'ok', reason)
+                            for alarm, reason in zip(self.alarms, reasons)]
+                self.assertEqual(self.notifier.notify.call_args_list, expected)
+
+    def test_simple_alarm_trip_with_outlier_exclusion(self):
+        # exclusion enabled: the shared driver expects the alarms to trip
+        self._do_test_simple_alarm_trip_outlier_exclusion(
+            exclude_outliers=True)
+
+    def test_simple_alarm_no_trip_without_outlier_exclusion(self):
+        # exclusion disabled: the shared driver expects the alarms to stay ok
+        self._do_test_simple_alarm_trip_outlier_exclusion(
+            exclude_outliers=False)
+
+    def _do_test_simple_alarm_clear_outlier_exclusion(self, exclude_outliers):
+        # Shared driver: start every alarm in 'alarm', feed statistics whose
+        # final datapoint has a much lower sample count than the rest, and
+        # check that the alarms only clear when exclude_outliers is set.
+        self._set_all_rules('exclude_outliers', exclude_outliers)
+        self._set_all_alarms('alarm')
+        with mock.patch('ceilometerclient.client.get_client',
+                        return_value=self.api_client):
+            # most recent datapoints outside threshold but with
+            # anomalously low sample count
+            threshold = self.alarms[0].rule['threshold']
+            # nine datapoints at or below the threshold (count=20) followed
+            # by a single one above it (count=1)
+            avgs = [self._get_stat('avg',
+                                   threshold - (v if v < 9 else -v),
+                                   count=20 if v < 9 else 1)
+                    for v in xrange(10)]
+            threshold = self.alarms[1].rule['threshold']
+            # seven datapoints above the threshold (count=20) followed by a
+            # single one below it (count=1)
+            maxs = [self._get_stat('max',
+                                   threshold + (v if v < 8 else -v),
+                                   count=20 if v < 8 else 1)
+                    for v in xrange(1, 9)]
+            self.api_client.statistics.list.side_effect = [avgs, maxs]
+            self._evaluate_all_alarms()
+            self._assert_all_alarms('ok' if exclude_outliers else 'alarm')
+            if exclude_outliers:
+                expected = [mock.call(alarm.alarm_id, state='ok')
+                            for alarm in self.alarms]
+                update_calls = self.api_client.alarms.set_state.call_args_list
+                self.assertEqual(update_calls, expected)
+                # the low-count final datapoint is expected to be dropped,
+                # so the reported "most recent" value is the second-to-last
+                reasons = ['Transition to ok due to 5 samples inside'
+                           ' threshold, most recent: %s' % avgs[-2].avg,
+                           'Transition to ok due to 4 samples inside'
+                           ' threshold, most recent: %s' % maxs[-2].max]
+                expected = [mock.call(alarm, 'alarm', reason)
+                            for alarm, reason in zip(self.alarms, reasons)]
+                self.assertEqual(self.notifier.notify.call_args_list, expected)
+
+    def test_simple_alarm_clear_with_outlier_exclusion(self):
+        # exclusion enabled: the shared driver expects the alarms to clear
+        self._do_test_simple_alarm_clear_outlier_exclusion(
+            exclude_outliers=True)
+
+    def test_simple_alarm_no_clear_without_outlier_exclusion(self):
+        # exclusion disabled: the shared driver expects the alarms to remain
+        # in the alarm state
+        self._do_test_simple_alarm_clear_outlier_exclusion(
+            exclude_outliers=False)
--- a/ceilometer/tests/api/v2/test_alarm_scenarios.py
+++ b/ceilometer/tests/api/v2/test_alarm_scenarios.py
@ -150,6 +150,23 @@ class TestAlarms(FunctionalTest,
                         )]:
            self.conn.update_alarm(alarm)

+    @staticmethod
+    def _add_default_threshold_rule(alarm):
+        """Fill in ``exclude_outliers=False`` on the alarm's threshold rule
+        when the key is absent; an existing value is left untouched.
+        """
+        alarm['threshold_rule'].setdefault('exclude_outliers', False)
+
+    def _verify_alarm(self, json, alarm, expected_name=None):
+        """Assert that a stored alarm matches the posted JSON body.
+
+        Fails outright when ``expected_name`` is given and differs from
+        ``alarm.name``. The default ``exclude_outliers`` value is added to
+        ``json`` before comparing, and any key ending in ``_rule`` is
+        compared against the alarm's ``rule`` attribute.
+        """
+        wrong_name = expected_name and alarm.name != expected_name
+        if wrong_name:
+            self.fail("Alarm not found")
+        self._add_default_threshold_rule(json)
+        for key in json:
+            storage_key = 'rule' if key.endswith('_rule') else key
+            stored = getattr(alarm, storage_key)
+            self.assertEqual(stored, json[key])
+
    def test_list_alarms(self):
        data = self.get_json('/alarms')
        self.assertEqual(4, len(data))
@ -394,6 +411,7 @@ class TestAlarms(FunctionalTest,
            }

        }
+        self._add_default_threshold_rule(to_check)

        json = {
            'name': 'added_alarm_defaults',
@ -420,7 +438,7 @@ class TestAlarms(FunctionalTest,
        else:
            self.fail("Alarm not found")

-    def test_post_alarm(self):
+    def _do_test_post_alarm(self, exclude_outliers=None):
        json = {
            'enabled': False,
            'name': 'added_alarm',
@ -443,6 +461,9 @@ class TestAlarms(FunctionalTest,
                'period': '180',
            }
        }
+        if exclude_outliers is not None:
+            json['threshold_rule']['exclude_outliers'] = exclude_outliers
+
        self.post_json('/alarms', params=json, status=201,
                       headers=self.auth_headers)
        alarms = list(self.conn.get_alarms(enabled=False))
@ -453,16 +474,16 @@ class TestAlarms(FunctionalTest,
        # to check to BoundedInt type conversion
        json['threshold_rule']['evaluation_periods'] = 3
        json['threshold_rule']['period'] = 180
-        if alarms[0].name == 'added_alarm':
-            for key in json:
-                if key.endswith('_rule'):
-                    storage_key = 'rule'
-                else:
-                    storage_key = key
-                self.assertEqual(getattr(alarms[0], storage_key),
-                                 json[key])
-        else:
-            self.fail("Alarm not found")
+        self._verify_alarm(json, alarms[0], 'added_alarm')
+
+    def test_post_alarm_outlier_exclusion_set(self):
+        self._do_test_post_alarm(True)
+
+    def test_post_alarm_outlier_exclusion_clear(self):
+        self._do_test_post_alarm(False)
+
+    def test_post_alarm_outlier_exclusion_defaulted(self):
+        self._do_test_post_alarm()

    def _do_test_post_alarm_as_admin(self, explicit_project_constraint):
        """Test the creation of an alarm as admin for another project."""
@ -499,6 +520,7 @@ class TestAlarms(FunctionalTest,
        self.assertEqual(1, len(alarms))
        self.assertEqual(alarms[0].user_id, 'auseridthatisnotmine')
        self.assertEqual(alarms[0].project_id, 'aprojectidthatisnotmine')
+        self._add_default_threshold_rule(json)
        if alarms[0].name == 'added_alarm':
            for key in json:
                if key.endswith('_rule'):
@ -566,16 +588,7 @@ class TestAlarms(FunctionalTest,
        self.assertEqual(1, len(alarms))
        self.assertEqual(alarms[0].user_id, self.auth_headers['X-User-Id'])
        self.assertEqual(alarms[0].project_id, 'aprojectidthatisnotmine')
-        if alarms[0].name == 'added_alarm':
-            for key in json:
-                if key.endswith('_rule'):
-                    storage_key = 'rule'
-                else:
-                    storage_key = key
-                self.assertEqual(getattr(alarms[0], storage_key),
-                                 json[key])
-        else:
-            self.fail("Alarm not found")
+        self._verify_alarm(json, alarms[0], 'added_alarm')

    def test_post_alarm_as_admin_no_project(self):
        """Test the creation of an alarm as admin for another project but
@ -612,16 +625,7 @@ class TestAlarms(FunctionalTest,
        self.assertEqual(alarms[0].user_id, 'auseridthatisnotmine')
        self.assertEqual(alarms[0].project_id,
                         self.auth_headers['X-Project-Id'])
-        if alarms[0].name == 'added_alarm':
-            for key in json:
-                if key.endswith('_rule'):
-                    storage_key = 'rule'
-                else:
-                    storage_key = key
-                self.assertEqual(getattr(alarms[0], storage_key),
-                                 json[key])
-        else:
-            self.fail("Alarm not found")
+        self._verify_alarm(json, alarms[0], 'added_alarm')

    def test_post_alarm_combination(self):
        json = {
@ -818,12 +822,7 @@ class TestAlarms(FunctionalTest,
        json['threshold_rule']['query'].append({
            'field': 'project_id', 'op': 'eq',
            'value': self.auth_headers['X-Project-Id']})
-        for key in json:
-            if key.endswith('_rule'):
-                storage_key = 'rule'
-            else:
-                storage_key = key
-            self.assertEqual(getattr(alarm, storage_key), json[key])
+        self._verify_alarm(json, alarm)

    def test_put_alarm_as_admin(self):
        json = {
@ -870,12 +869,7 @@ class TestAlarms(FunctionalTest,
        alarm = list(self.conn.get_alarms(alarm_id=alarm_id, enabled=False))[0]
        self.assertEqual(alarm.user_id, 'myuserid')
        self.assertEqual(alarm.project_id, 'myprojectid')
-        for key in json:
-            if key.endswith('_rule'):
-                storage_key = 'rule'
-            else:
-                storage_key = key
-            self.assertEqual(getattr(alarm, storage_key), json[key])
+        self._verify_alarm(json, alarm)

    def test_put_alarm_wrong_field(self):
        # Note: wsme will ignore unknown fields so will just not appear in
@ -1046,6 +1040,7 @@ class TestAlarms(FunctionalTest,
                                    type='creation',
                                    user_id=alarm['user_id']),
                               history[0])
+        self._add_default_threshold_rule(new_alarm)
        new_alarm['rule'] = new_alarm['threshold_rule']
        del new_alarm['threshold_rule']
        new_alarm['rule']['query'].append({
@ -1118,6 +1113,7 @@ class TestAlarms(FunctionalTest,
        data = dict(state='alarm')
        self._update_alarm(alarm, data, auth_headers=admin_auth)

+        self._add_default_threshold_rule(new_alarm)
        new_alarm['rule'] = new_alarm['threshold_rule']
        del new_alarm['threshold_rule']