Merge "Add Prometheus evaluator"

This commit is contained in:
Zuul 2023-11-07 16:02:59 +00:00 committed by Gerrit Code Review
commit 197440c3c8
10 changed files with 226 additions and 25 deletions

View File

@ -41,7 +41,8 @@ class CompositeRule(wtypes.UserType):
threshold_plugins = None threshold_plugins = None
def __init__(self): def __init__(self):
threshold_rules = ('gnocchi_resources_threshold', threshold_rules = ('prometheus',
'gnocchi_resources_threshold',
'gnocchi_aggregation_by_metrics_threshold', 'gnocchi_aggregation_by_metrics_threshold',
'gnocchi_aggregation_by_resources_threshold') 'gnocchi_aggregation_by_resources_threshold')
CompositeRule.threshold_plugins = named.NamedExtensionManager( CompositeRule.threshold_plugins = named.NamedExtensionManager(

View File

@ -0,0 +1,46 @@
#
# Copyright 2023 Red Hat, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_log import log
import wsme
from wsme import types as wtypes
from aodh.api.controllers.v2 import base
LOG = log.getLogger(__name__)
class PrometheusRule(base.AlarmRule):
    """API representation of a ``prometheus`` alarm rule.

    The rule carries a Prometheus query together with the threshold and the
    comparison operator applied to the values returned for that query.
    """

    comparison_operator = base.AdvEnum('comparison_operator', str,
                                       'lt', 'le', 'eq', 'ne', 'ge', 'gt',
                                       default='eq')
    "The comparison against the alarm threshold"

    threshold = wsme.wsattr(float, mandatory=True)
    "The threshold of the alarm"

    query = wsme.wsattr(wtypes.text, mandatory=True)
    "The Prometheus query"

    @staticmethod
    def validate(rule):
        """Return ``rule`` unchanged; no validation is performed yet."""
        # TODO(mmagr): validate Prometheus query maybe?
        return rule

    def as_dict(self):
        """Return the rule as a dict limited to the rule's own attributes."""
        return self.as_dict_from_keys(
            ['comparison_operator', 'threshold', 'query'])

View File

@ -116,7 +116,8 @@ class CompositeEvaluator(evaluator.Evaluator):
@property @property
def threshold_evaluators(self): def threshold_evaluators(self):
if not self._threshold_evaluators: if not self._threshold_evaluators:
threshold_types = ('gnocchi_resources_threshold', threshold_types = ('prometheus',
'gnocchi_resources_threshold',
'gnocchi_aggregation_by_metrics_threshold', 'gnocchi_aggregation_by_metrics_threshold',
'gnocchi_aggregation_by_resources_threshold') 'gnocchi_aggregation_by_resources_threshold')
self._threshold_evaluators = stevedore.NamedExtensionManager( self._threshold_evaluators = stevedore.NamedExtensionManager(

View File

@ -0,0 +1,78 @@
#
# Copyright 2023 Red Hat, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_config import cfg
from oslo_log import log
from observabilityclient import client
from aodh.evaluator import threshold
from aodh import keystone_client
LOG = log.getLogger(__name__)
# Configuration options declared by the Prometheus evaluator module.
# ``prometheus_disable_rbac`` is read in PrometheusBase.__init__ and passed
# as ``disable_rbac`` to every Prometheus query issued by the evaluator.
OPTS = [
cfg.BoolOpt('prometheus_disable_rbac',
default=False,
help='Disable RBAC for Prometheus evaluator.'),
]
class PrometheusBase(threshold.ThresholdEvaluator):
    """Threshold evaluator base that fetches its data from Prometheus."""

    def __init__(self, conf):
        """Set up the Prometheus client and cache the RBAC switch.

        :param conf: service configuration object
        """
        super().__init__(conf)
        self._set_obsclient(conf)
        self._no_rbac = conf.prometheus_disable_rbac

    def _set_obsclient(self, conf):
        """Create the observability client and store it in ``self._prom``.

        The client is built on a keystone session and is told which
        endpoint interface and region to use via ``adapter_options``.
        """
        keystone_session = keystone_client.get_session(conf)
        credentials = conf.service_credentials
        adapter_options = {
            'interface': credentials.interface,
            'region_name': credentials.region_name,
        }
        # NOTE(review): '1' is presumably the client API version - confirm.
        self._prom = client.Client('1', keystone_session,
                                   adapter_options=adapter_options)

    def _get_metric_data(self, query):
        """Run ``query`` against Prometheus and return the client's result.

        :param query: Prometheus query string
        """
        LOG.debug(f'Querying Prometheus instance on: {query}')
        result = self._prom.query.query(query, disable_rbac=self._no_rbac)
        return result
class PrometheusEvaluator(PrometheusBase):
    """Evaluate ``prometheus`` alarm rules against Prometheus query results."""

    def _sanitize(self, metric_data):
        """Convert fetched metrics into a list of float statistics.

        :param metric_data: iterable of metric objects exposing ``value``
        :returns: list with ``float(value)`` of every metric, in input order
        """
        sanitized = [float(m.value) for m in metric_data]
        LOG.debug(f'Sanitized Prometheus metric data: {metric_data}'
                  f' to statistics: {sanitized}')
        return sanitized

    def evaluate_rule(self, alarm_rule):
        """Evaluate alarm rule.

        :param alarm_rule: rule dict; its ``query`` key is sent to Prometheus
        :returns: state, trending state, statistics, number of samples outside
                  threshold and reason
        :raises threshold.InsufficientDataError: when the query yields no
                  datapoints
        """
        metrics = self._get_metric_data(alarm_rule['query'])
        if not metrics:
            LOG.warning("Empty result fetched from Prometheus for query"
                        f" {alarm_rule['query']}")

        statistics = self._sanitize(metrics)
        if not statistics:
            raise threshold.InsufficientDataError('datapoints are unknown',
                                                  statistics)
        return self._process_statistics(alarm_rule, statistics)

    def _unknown_reason_data(self, alarm, statistics):
        """Build the reason data used when the alarm turns to unknown state.

        :param alarm: alarm being transitioned (only logged here)
        :param statistics: datapoints gathered for the alarm; may be empty
        :returns: result of ``_reason_data`` for the 'unknown' disposition
        """
        LOG.warning(f'Transferring alarm {alarm} on unknown reason')
        # Guard the count the same way as the most recent value, so that
        # missing statistics (empty or None) do not raise TypeError on len().
        if statistics:
            count, last = len(statistics), statistics[-1]
        else:
            count, last = 0, None
        return self._reason_data('unknown', count, last)

View File

@ -96,19 +96,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
' %(disposition)s threshold, most recent: %(most_recent)s' ' %(disposition)s threshold, most recent: %(most_recent)s'
% dict(reason_data, state=state), reason_data) % dict(reason_data, state=state), reason_data)
def evaluate_rule(self, alarm_rule): def _process_statistics(self, alarm_rule, statistics):
"""Evaluate alarm rule.
:returns: state, trending state and statistics.
"""
start, end = self._bound_duration(alarm_rule)
statistics = self._statistics(alarm_rule, start, end)
statistics = self._sanitize(alarm_rule, statistics)
sufficient = len(statistics) >= alarm_rule['evaluation_periods']
if not sufficient:
raise InsufficientDataError(
'%d datapoints are unknown' % alarm_rule['evaluation_periods'],
statistics)
def _compare(value): def _compare(value):
op = COMPARATORS[alarm_rule['comparison_operator']] op = COMPARATORS[alarm_rule['comparison_operator']]
@ -129,6 +117,31 @@ class ThresholdEvaluator(evaluator.Evaluator):
trending_state = evaluator.ALARM if compared[-1] else evaluator.OK trending_state = evaluator.ALARM if compared[-1] else evaluator.OK
return None, trending_state, statistics, number_outside, None return None, trending_state, statistics, number_outside, None
def evaluate_rule(self, alarm_rule):
    """Evaluate alarm rule.

    Fetches the statistics for the rule's bounded duration, sanitizes them
    and hands them over to ``_process_statistics``.

    :returns: the result of ``_process_statistics``: state, trending state,
              statistics, number of samples outside threshold and reason
    :raises InsufficientDataError: when fewer datapoints than
              ``evaluation_periods`` are left after sanitizing
    """
    start, end = self._bound_duration(alarm_rule)
    fetched = self._statistics(alarm_rule, start, end)
    statistics = self._sanitize(alarm_rule, fetched)

    if len(statistics) < alarm_rule['evaluation_periods']:
        raise InsufficientDataError(
            '%d datapoints are unknown' % alarm_rule['evaluation_periods'],
            statistics)

    return self._process_statistics(alarm_rule, statistics)
def _unknown_reason_data(self, alarm, statistics):
    """Log the datapoint shortage and build the 'unknown' reason data.

    :param alarm: alarm whose rule provides ``evaluation_periods``
    :param statistics: datapoints that were actually gathered
    :returns: result of ``_reason_data`` for the 'unknown' disposition
    """
    LOG.warning(f'Expecting {alarm.rule["evaluation_periods"]} datapoints'
                f' but only get {len(statistics)}')
    # The reason intentionally differs from the log message: its format is
    # kept stable because thirdparty software may depend on the old format.
    most_recent = statistics[-1] if statistics else None
    return self._reason_data('unknown',
                             alarm.rule['evaluation_periods'],
                             most_recent)
def _transition_alarm(self, alarm, state, trending_state, statistics, def _transition_alarm(self, alarm, state, trending_state, statistics,
outside_count, unknown_reason): outside_count, unknown_reason):
unknown = alarm.state == evaluator.UNKNOWN unknown = alarm.state == evaluator.UNKNOWN
@ -143,16 +156,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
return return
if state == evaluator.UNKNOWN and not unknown: if state == evaluator.UNKNOWN and not unknown:
LOG.warning('Expecting %(expected)d datapoints but only get ' reason_data = self._unknown_reason_data(alarm, statistics)
'%(actual)d'
% {'expected': alarm.rule['evaluation_periods'],
'actual': len(statistics)})
# Reason is not same as log message because we want to keep
# consistent since thirdparty software may depend on old format.
last = None if not statistics else statistics[-1]
reason_data = self._reason_data('unknown',
alarm.rule['evaluation_periods'],
last)
self._refresh(alarm, state, unknown_reason, reason_data) self._refresh(alarm, state, unknown_reason, reason_data)
elif state and (alarm.state != state or continuous): elif state and (alarm.state != state or continuous):

View File

@ -23,6 +23,7 @@ import aodh.evaluator
import aodh.evaluator.event import aodh.evaluator.event
import aodh.evaluator.gnocchi import aodh.evaluator.gnocchi
import aodh.evaluator.loadbalancer import aodh.evaluator.loadbalancer
import aodh.evaluator.prometheus
import aodh.evaluator.threshold import aodh.evaluator.threshold
import aodh.event import aodh.event
import aodh.keystone_client import aodh.keystone_client
@ -38,6 +39,7 @@ def list_opts():
itertools.chain( itertools.chain(
aodh.evaluator.OPTS, aodh.evaluator.OPTS,
aodh.evaluator.event.OPTS, aodh.evaluator.event.OPTS,
aodh.evaluator.prometheus.OPTS,
aodh.evaluator.threshold.OPTS, aodh.evaluator.threshold.OPTS,
aodh.evaluator.loadbalancer.OPTS, aodh.evaluator.loadbalancer.OPTS,
aodh.notifier.rest.OPTS, aodh.notifier.rest.OPTS,

View File

@ -15,6 +15,7 @@
from unittest import mock from unittest import mock
import fixtures import fixtures
import os
from oslo_utils import timeutils from oslo_utils import timeutils
from oslo_utils import uuidutils from oslo_utils import uuidutils
@ -25,6 +26,12 @@ from aodh.tests import constants
from aodh.tests.unit.evaluator import base from aodh.tests.unit.evaluator import base
# NOTE(mmagr): Overriding PrometheusEvaluator setting to avoid
# complains during init.
os.environ['PROMETHEUS_HOST'] = '127.0.0.1'
os.environ['PROMETHEUS_PORT'] = '666'
class BaseCompositeEvaluate(base.TestEvaluatorBase): class BaseCompositeEvaluate(base.TestEvaluatorBase):
EVALUATOR = composite.CompositeEvaluator EVALUATOR = composite.CompositeEvaluator

View File

@ -18,11 +18,14 @@ import fixtures
import time import time
from unittest import mock from unittest import mock
from observabilityclient import prometheus_client
from oslo_config import fixture as fixture_config from oslo_config import fixture as fixture_config
from stevedore import extension from stevedore import extension
from aodh import evaluator from aodh import evaluator
from aodh import service from aodh import service
from aodh.evaluator import prometheus
from aodh.tests import base as tests_base from aodh.tests import base as tests_base
@ -190,3 +193,59 @@ class TestAlarmEvaluationService(tests_base.BaseTestCase):
target = svc.partition_coordinator.extract_my_subset target = svc.partition_coordinator.extract_my_subset
self.assertEqual(0, target.call_count) self.assertEqual(0, target.call_count)
self.assertEqual(0, self.threshold_eval.evaluate.call_count) self.assertEqual(0, self.threshold_eval.evaluate.call_count)
class TestPrometheusEvaluator(tests_base.BaseTestCase):
    """Unit tests for rule evaluation done by the Prometheus evaluator."""

    def setUp(self):
        super().setUp()
        conf = service.prepare_service(argv=[], config_files=[])
        self.CONF = self.useFixture(fixture_config.Config(conf)).conf

    def test_rule_evaluation(self):
        """Check final and trending states for several thresholds."""
        # Metric values paired with increasing timestamps 0..5.
        values = (10, 15, 20, 25, 30, 15)
        metric_list = [
            prometheus_client.PrometheusMetric(
                {'metric': 'mtr', 'value': (timestamp, value)})
            for timestamp, value in enumerate(values)
        ]

        with mock.patch.object(prometheus.PrometheusEvaluator,
                               '_set_obsclient', return_value=None):
            # mock Prometheus client
            ev = prometheus.PrometheusEvaluator(self.CONF)
            ev._get_metric_data = mock.Mock(return_value=metric_list)

            def evaluate(limit):
                return ev.evaluate_rule({'query': 'mtr', 'threshold': limit,
                                         'comparison_operator': 'gt'})

            # test transfer to alarm state
            state, _trend, _stats, outside, _reason = evaluate(9)
            self.assertEqual('alarm', state)
            self.assertEqual(6, outside)

            # test transfer to ok state
            state, _trend, _stats, outside, _reason = evaluate(31)
            self.assertEqual('ok', state)
            self.assertEqual(0, outside)

            # test trending to alarm state
            _state, trend, _stats, outside, _reason = evaluate(14)
            self.assertEqual('alarm', trend)
            self.assertEqual(5, outside)

            # test trending to ok state
            _state, trend, _stats, outside, _reason = evaluate(20)
            self.assertEqual('ok', trend)
            self.assertEqual(2, outside)

View File

@ -36,6 +36,7 @@ cachetools>=1.1.6
cotyledon>=1.7.3 cotyledon>=1.7.3
keystoneauth1>=2.1 keystoneauth1>=2.1
debtcollector>=1.2.0 # Apache-2.0 debtcollector>=1.2.0 # Apache-2.0
python-observabilityclient>=0.0.4
python-octaviaclient>=1.8.0 python-octaviaclient>=1.8.0
python-dateutil>=2.8.2 # BSD python-dateutil>=2.8.2 # BSD
python-heatclient>=1.17.0 python-heatclient>=1.17.0

View File

@ -57,6 +57,7 @@ aodh.alarm.rule =
event = aodh.api.controllers.v2.alarm_rules.event:AlarmEventRule event = aodh.api.controllers.v2.alarm_rules.event:AlarmEventRule
composite = aodh.api.controllers.v2.alarm_rules.composite:composite_rule composite = aodh.api.controllers.v2.alarm_rules.composite:composite_rule
loadbalancer_member_health = aodh.api.controllers.v2.alarm_rules.loadbalancer:LoadBalancerMemberHealthRule loadbalancer_member_health = aodh.api.controllers.v2.alarm_rules.loadbalancer:LoadBalancerMemberHealthRule
prometheus = aodh.api.controllers.v2.alarm_rules.prometheus:PrometheusRule
aodh.evaluator = aodh.evaluator =
gnocchi_resources_threshold = aodh.evaluator.gnocchi:GnocchiResourceThresholdEvaluator gnocchi_resources_threshold = aodh.evaluator.gnocchi:GnocchiResourceThresholdEvaluator
@ -64,6 +65,7 @@ aodh.evaluator =
gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator
composite = aodh.evaluator.composite:CompositeEvaluator composite = aodh.evaluator.composite:CompositeEvaluator
loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator
prometheus = aodh.evaluator.prometheus:PrometheusEvaluator
aodh.notifier = aodh.notifier =
log = aodh.notifier.log:LogAlarmNotifier log = aodh.notifier.log:LogAlarmNotifier