Merge "Add Prometheus evaluator"

2023-11-07 16:02:59 +00:00 · 2023-11-07 16:02:59 +00:00 · 197440c3c8
commit 197440c3c8
parent 99df2a40bf f932265290
10 changed files with 226 additions and 25 deletions
--- a/aodh/api/controllers/v2/alarm_rules/composite.py
+++ b/aodh/api/controllers/v2/alarm_rules/composite.py
@ -41,7 +41,8 @@ class CompositeRule(wtypes.UserType):
    threshold_plugins = None
    def __init__(self):
-        threshold_rules = ('gnocchi_resources_threshold',
+        threshold_rules = ('prometheus',
                           'gnocchi_resources_threshold',
                           'gnocchi_aggregation_by_metrics_threshold',
                           'gnocchi_aggregation_by_resources_threshold')
        CompositeRule.threshold_plugins = named.NamedExtensionManager(
--- a/aodh/api/controllers/v2/alarm_rules/prometheus.py
+++ b/aodh/api/controllers/v2/alarm_rules/prometheus.py
@ -0,0 +1,46 @@
 #
 # Copyright 2023 Red Hat, Inc
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
 # a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 # License for the specific language governing permissions and limitations
 # under the License.
 from oslo_log import log
 import wsme
 from wsme import types as wtypes
 from aodh.api.controllers.v2 import base
 LOG = log.getLogger(__name__)
 class PrometheusRule(base.AlarmRule):
    """API representation of an alarm rule driven by a Prometheus query."""

    comparison_operator = base.AdvEnum(
        'comparison_operator', str,
        'lt', 'le', 'eq', 'ne', 'ge', 'gt',
        default='eq')
    "The comparison against the alarm threshold"

    threshold = wsme.wsattr(float, mandatory=True)
    "The threshold of the alarm"

    query = wsme.wsattr(wtypes.text, mandatory=True)
    "The Prometheus query"

    @staticmethod
    def validate(rule):
        """Return the rule unchanged; no validation is performed yet."""
        # TODO(mmagr): validate Prometheus query maybe?
        return rule

    def as_dict(self):
        """Return the rule's operator, threshold and query as a dict.

        The conversion itself is delegated to ``as_dict_from_keys``.
        """
        exported_keys = ['comparison_operator', 'threshold', 'query']
        return self.as_dict_from_keys(exported_keys)
--- a/aodh/evaluator/composite.py
+++ b/aodh/evaluator/composite.py
@ -116,7 +116,8 @@ class CompositeEvaluator(evaluator.Evaluator):
    @property
    def threshold_evaluators(self):
        if not self._threshold_evaluators:
-            threshold_types = ('gnocchi_resources_threshold',
+            threshold_types = ('prometheus',
                               'gnocchi_resources_threshold',
                               'gnocchi_aggregation_by_metrics_threshold',
                               'gnocchi_aggregation_by_resources_threshold')
            self._threshold_evaluators = stevedore.NamedExtensionManager(
--- a/aodh/evaluator/prometheus.py
+++ b/aodh/evaluator/prometheus.py
@ -0,0 +1,78 @@
 #
 # Copyright 2023 Red Hat, Inc
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
 # a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 # License for the specific language governing permissions and limitations
 # under the License.
 from oslo_config import cfg
 from oslo_log import log
 from observabilityclient import client
 from aodh.evaluator import threshold
 from aodh import keystone_client
 LOG = log.getLogger(__name__)
 # Configuration options read by the Prometheus evaluator; this list is also
 # chained into aodh.opts.list_opts().
 OPTS = [
    cfg.BoolOpt('prometheus_disable_rbac',
                default=False,
                help='Disable RBAC for Prometheus evaluator.'),
 ]
 class PrometheusBase(threshold.ThresholdEvaluator):
    """Threshold evaluator base that fetches its data from Prometheus."""

    def __init__(self, conf):
        """Create the observability client and cache the RBAC switch.

        :param conf: configuration object providing ``service_credentials``
                     and ``prometheus_disable_rbac``.
        """
        super().__init__(conf)
        self._set_obsclient(conf)
        self._no_rbac = conf.prometheus_disable_rbac

    def _set_obsclient(self, conf):
        """Build the observabilityclient instance stored in ``self._prom``."""
        ks_session = keystone_client.get_session(conf)
        credentials = conf.service_credentials
        adapter_opts = {
            'interface': credentials.interface,
            'region_name': credentials.region_name,
        }
        # NOTE(review): '1' is presumably the client API version - confirm
        # against python-observabilityclient.
        self._prom = client.Client('1', ks_session,
                                   adapter_options=adapter_opts)

    def _get_metric_data(self, query):
        """Run ``query`` through the client and return its result.

        :param query: the Prometheus query string taken from the alarm rule.
        """
        LOG.debug(f'Querying Prometheus instance on: {query}')
        result = self._prom.query.query(query, disable_rbac=self._no_rbac)
        return result
 class PrometheusEvaluator(PrometheusBase):
    """Evaluate alarm rules against the result of a Prometheus query.

    Every metric returned for ``alarm_rule['query']`` becomes one datapoint
    that is handed to ``_process_statistics``.
    """

    def _sanitize(self, metric_data):
        """Convert fetched metrics to a list of floats.

        :param metric_data: iterable of objects exposing a ``value``
                            attribute, or None when nothing was fetched.
        :returns: list holding ``float(m.value)`` for every metric, in order.
        """
        # evaluate_rule() treats any falsy result (None included) as empty,
        # so do the same here; iterating None would raise TypeError instead
        # of letting the caller report insufficient data.
        sanitized = [float(m.value) for m in metric_data or ()]
        LOG.debug(f'Sanited Prometheus metric data: {metric_data}'
                  f' to statistics: {sanitized}')
        return sanitized

    def evaluate_rule(self, alarm_rule):
        """Evaluate alarm rule.

        :param alarm_rule: rule dict; its ``query`` key is sent to
                           Prometheus and the whole dict is passed on to
                           ``_process_statistics``.
        :returns: state, trending state, statistics, number of samples outside
                  threshold and reason
        :raises threshold.InsufficientDataError: when the query yields no
                  datapoints.
        """
        metrics = self._get_metric_data(alarm_rule['query'])
        if not metrics:
            LOG.warning("Empty result fetched from Prometheus for query"
                        f" {alarm_rule['query']}")

        statistics = self._sanitize(metrics)
        if not statistics:
            raise threshold.InsufficientDataError('datapoints are unknown',
                                                  statistics)
        return self._process_statistics(alarm_rule, statistics)

    def _unknown_reason_data(self, alarm, statistics):
        """Build the reason data used when the alarm becomes unknown.

        :param alarm: the alarm being transitioned (only logged here).
        :param statistics: datapoints gathered so far, possibly empty.
        :returns: result of ``_reason_data`` for the 'unknown' disposition,
                  the datapoint count and the most recent datapoint (None
                  when there is none).
        """
        LOG.warning(f'Transfering alarm {alarm} on unknown reason')
        last = None if not statistics else statistics[-1]
        return self._reason_data('unknown', len(statistics), last)
--- a/aodh/evaluator/threshold.py
+++ b/aodh/evaluator/threshold.py
@ -96,19 +96,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
                ' %(disposition)s threshold, most recent: %(most_recent)s'
                % dict(reason_data, state=state), reason_data)
-    def evaluate_rule(self, alarm_rule):
+    def _process_statistics(self, alarm_rule, statistics):
        """Evaluate alarm rule.
        :returns: state, trending state and statistics.
        """
        start, end = self._bound_duration(alarm_rule)
        statistics = self._statistics(alarm_rule, start, end)
        statistics = self._sanitize(alarm_rule, statistics)
        sufficient = len(statistics) >= alarm_rule['evaluation_periods']
        if not sufficient:
            raise InsufficientDataError(
                '%d datapoints are unknown' % alarm_rule['evaluation_periods'],
                statistics)
        def _compare(value):
            op = COMPARATORS[alarm_rule['comparison_operator']]
@ -129,6 +117,31 @@ class ThresholdEvaluator(evaluator.Evaluator):
            trending_state = evaluator.ALARM if compared[-1] else evaluator.OK
            return None, trending_state, statistics, number_outside, None
    def evaluate_rule(self, alarm_rule):
        """Gather statistics for the rule and evaluate them.

        The evaluation window comes from ``_bound_duration``, the raw
        statistics from ``_statistics`` and they are cleaned up by
        ``_sanitize`` before being handed to ``_process_statistics``.

        :returns: whatever ``_process_statistics`` returns for the rule.
        :raises InsufficientDataError: when fewer datapoints than
                ``alarm_rule['evaluation_periods']`` are available.
        """
        period_start, period_end = self._bound_duration(alarm_rule)
        raw = self._statistics(alarm_rule, period_start, period_end)
        datapoints = self._sanitize(alarm_rule, raw)

        required = alarm_rule['evaluation_periods']
        if len(datapoints) < required:
            raise InsufficientDataError(
                '%d datapoints are unknown' % required,
                datapoints)

        return self._process_statistics(alarm_rule, datapoints)
    def _unknown_reason_data(self, alarm, statistics):
        """Log the datapoint shortfall and build 'unknown' reason data.

        :param alarm: alarm whose rule supplies ``evaluation_periods``.
        :param statistics: datapoints gathered so far, possibly empty.
        :returns: result of ``_reason_data`` for the 'unknown' disposition,
                  the expected datapoint count and the most recent datapoint
                  (None when there is none).
        """
        LOG.warning(f'Expecting {alarm.rule["evaluation_periods"]} datapoints'
                    f' but only get {len(statistics)}')
        # The reason intentionally differs from the log message: third-party
        # software may depend on the old reason format, so it is kept stable.
        most_recent = statistics[-1] if statistics else None
        expected = alarm.rule['evaluation_periods']
        return self._reason_data('unknown', expected, most_recent)
    def _transition_alarm(self, alarm, state, trending_state, statistics,
                          outside_count, unknown_reason):
        unknown = alarm.state == evaluator.UNKNOWN
@ -143,16 +156,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
                return
        if state == evaluator.UNKNOWN and not unknown:
-            LOG.warning('Expecting %(expected)d datapoints but only get '
+            reason_data = self._unknown_reason_data(alarm, statistics)
                        '%(actual)d'
                        % {'expected': alarm.rule['evaluation_periods'],
                           'actual': len(statistics)})
            # Reason is not same as log message because we want to keep
            # consistent since thirdparty software may depend on old format.
            last = None if not statistics else statistics[-1]
            reason_data = self._reason_data('unknown',
                                            alarm.rule['evaluation_periods'],
                                            last)
            self._refresh(alarm, state, unknown_reason, reason_data)
        elif state and (alarm.state != state or continuous):
--- a/aodh/opts.py
+++ b/aodh/opts.py
@ -23,6 +23,7 @@ import aodh.evaluator
 import aodh.evaluator.event
 import aodh.evaluator.gnocchi
 import aodh.evaluator.loadbalancer
 import aodh.evaluator.prometheus
 import aodh.evaluator.threshold
 import aodh.event
 import aodh.keystone_client
@ -38,6 +39,7 @@ def list_opts():
         itertools.chain(
             aodh.evaluator.OPTS,
             aodh.evaluator.event.OPTS,
             aodh.evaluator.prometheus.OPTS,
             aodh.evaluator.threshold.OPTS,
             aodh.evaluator.loadbalancer.OPTS,
             aodh.notifier.rest.OPTS,
--- a/aodh/tests/unit/evaluator/test_composite.py
+++ b/aodh/tests/unit/evaluator/test_composite.py
@ -15,6 +15,7 @@
 from unittest import mock
 import fixtures
 import os
 from oslo_utils import timeutils
 from oslo_utils import uuidutils
@ -25,6 +26,12 @@ from aodh.tests import constants
 from aodh.tests.unit.evaluator import base
 # NOTE(mmagr): Override the PrometheusEvaluator settings to avoid
 # complaints during init.
 os.environ['PROMETHEUS_HOST'] = '127.0.0.1'
 os.environ['PROMETHEUS_PORT'] = '666'
 class BaseCompositeEvaluate(base.TestEvaluatorBase):
    EVALUATOR = composite.CompositeEvaluator
--- a/aodh/tests/unit/test_evaluator.py
+++ b/aodh/tests/unit/test_evaluator.py
@ -18,11 +18,14 @@ import fixtures
 import time
 from unittest import mock
 from observabilityclient import prometheus_client
 from oslo_config import fixture as fixture_config
 from stevedore import extension
 from aodh import evaluator
 from aodh import service
 from aodh.evaluator import prometheus
 from aodh.tests import base as tests_base
@ -190,3 +193,59 @@ class TestAlarmEvaluationService(tests_base.BaseTestCase):
        target = svc.partition_coordinator.extract_my_subset
        self.assertEqual(0, target.call_count)
        self.assertEqual(0, self.threshold_eval.evaluate.call_count)
 class TestPrometheusEvaluator(tests_base.BaseTestCase):
    """Exercise PrometheusEvaluator.evaluate_rule with canned metrics."""

    def setUp(self):
        super().setUp()
        prepared = service.prepare_service(argv=[], config_files=[])
        self.CONF = self.useFixture(fixture_config.Config(prepared)).conf

    def test_rule_evaluation(self):
        """Check definite and trending outcomes for several thresholds."""
        # Pairs passed as the 'value' entry of each metric; the second item
        # is the number compared against the threshold (the first is
        # presumably a timestamp).
        samples = [(0, 10), (1, 15), (2, 20), (3, 25), (4, 30), (5, 15)]
        metric_list = [
            prometheus_client.PrometheusMetric({'metric': 'mtr',
                                                'value': sample})
            for sample in samples
        ]

        def rule_for(limit):
            # Same query and operator every time; only the threshold varies.
            return {'query': 'mtr', 'threshold': limit,
                    'comparison_operator': 'gt'}

        with mock.patch.object(prometheus.PrometheusEvaluator,
                               '_set_obsclient', return_value=None):
            # mock Prometheus client
            ev = prometheus.PrometheusEvaluator(self.CONF)
            ev._get_metric_data = mock.Mock(return_value=metric_list)

            # test transfer to alarm state
            result = ev.evaluate_rule(rule_for(9))
            state, trend, stats, outside, reason = result
            self.assertEqual('alarm', state)
            self.assertEqual(6, outside)

            # test transfer to ok state
            result = ev.evaluate_rule(rule_for(31))
            state, trend, stats, outside, reason = result
            self.assertEqual('ok', state)
            self.assertEqual(0, outside)

            # test trending to alarm state
            result = ev.evaluate_rule(rule_for(14))
            state, trend, stats, outside, reason = result
            self.assertEqual('alarm', trend)
            self.assertEqual(5, outside)

            # test trending to ok state
            result = ev.evaluate_rule(rule_for(20))
            state, trend, stats, outside, reason = result
            self.assertEqual('ok', trend)
            self.assertEqual(2, outside)
--- a/requirements.txt
+++ b/requirements.txt
@ -36,6 +36,7 @@ cachetools>=1.1.6
 cotyledon>=1.7.3
 keystoneauth1>=2.1
 debtcollector>=1.2.0  # Apache-2.0
 python-observabilityclient>=0.0.4
 python-octaviaclient>=1.8.0
 python-dateutil>=2.8.2  # BSD
 python-heatclient>=1.17.0
--- a/setup.cfg
+++ b/setup.cfg
@ -57,6 +57,7 @@ aodh.alarm.rule =
    event = aodh.api.controllers.v2.alarm_rules.event:AlarmEventRule
    composite = aodh.api.controllers.v2.alarm_rules.composite:composite_rule
    loadbalancer_member_health = aodh.api.controllers.v2.alarm_rules.loadbalancer:LoadBalancerMemberHealthRule
    prometheus = aodh.api.controllers.v2.alarm_rules.prometheus:PrometheusRule
 aodh.evaluator =
    gnocchi_resources_threshold = aodh.evaluator.gnocchi:GnocchiResourceThresholdEvaluator
@ -64,6 +65,7 @@ aodh.evaluator =
    gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator
    composite = aodh.evaluator.composite:CompositeEvaluator
    loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator
    prometheus = aodh.evaluator.prometheus:PrometheusEvaluator
 aodh.notifier =
    log = aodh.notifier.log:LogAlarmNotifier