Merge "Add Prometheus evaluator"
This commit is contained in:
commit
197440c3c8
@ -41,7 +41,8 @@ class CompositeRule(wtypes.UserType):
|
||||
threshold_plugins = None
|
||||
|
||||
def __init__(self):
|
||||
threshold_rules = ('gnocchi_resources_threshold',
|
||||
threshold_rules = ('prometheus',
|
||||
'gnocchi_resources_threshold',
|
||||
'gnocchi_aggregation_by_metrics_threshold',
|
||||
'gnocchi_aggregation_by_resources_threshold')
|
||||
CompositeRule.threshold_plugins = named.NamedExtensionManager(
|
||||
|
46
aodh/api/controllers/v2/alarm_rules/prometheus.py
Normal file
46
aodh/api/controllers/v2/alarm_rules/prometheus.py
Normal file
@ -0,0 +1,46 @@
|
||||
#
|
||||
# Copyright 2023 Red Hat, Inc
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from oslo_log import log
|
||||
import wsme
|
||||
from wsme import types as wtypes
|
||||
|
||||
from aodh.api.controllers.v2 import base
|
||||
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
class PrometheusRule(base.AlarmRule):
    """Alarm rule made of a Prometheus query, a threshold and a comparator."""

    comparison_operator = base.AdvEnum(
        'comparison_operator', str,
        'lt', 'le', 'eq', 'ne', 'ge', 'gt',
        default='eq')
    "The comparison against the alarm threshold"

    threshold = wsme.wsattr(float, mandatory=True)
    "The threshold of the alarm"

    query = wsme.wsattr(wtypes.text, mandatory=True)
    "The Prometheus query"

    @staticmethod
    def validate(rule):
        """Return ``rule`` unchanged; no rule-specific checks are done yet."""
        # TODO(mmagr): validate Prometheus query maybe?
        return rule

    def as_dict(self):
        """Return the rule's three attributes via ``as_dict_from_keys``."""
        exported_keys = ['comparison_operator', 'threshold', 'query']
        return self.as_dict_from_keys(exported_keys)
|
@ -116,7 +116,8 @@ class CompositeEvaluator(evaluator.Evaluator):
|
||||
@property
|
||||
def threshold_evaluators(self):
|
||||
if not self._threshold_evaluators:
|
||||
threshold_types = ('gnocchi_resources_threshold',
|
||||
threshold_types = ('prometheus',
|
||||
'gnocchi_resources_threshold',
|
||||
'gnocchi_aggregation_by_metrics_threshold',
|
||||
'gnocchi_aggregation_by_resources_threshold')
|
||||
self._threshold_evaluators = stevedore.NamedExtensionManager(
|
||||
|
78
aodh/evaluator/prometheus.py
Normal file
78
aodh/evaluator/prometheus.py
Normal file
@ -0,0 +1,78 @@
|
||||
#
|
||||
# Copyright 2023 Red Hat, Inc
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
|
||||
from observabilityclient import client
|
||||
|
||||
from aodh.evaluator import threshold
|
||||
from aodh import keystone_client
|
||||
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
# Configuration options of the Prometheus evaluator; read back in
# PrometheusBase.__init__ as ``conf.prometheus_disable_rbac``.
OPTS = [
    cfg.BoolOpt(
        'prometheus_disable_rbac',
        default=False,
        help='Disable RBAC for Prometheus evaluator.',
    ),
]
|
||||
|
||||
|
||||
class PrometheusBase(threshold.ThresholdEvaluator):
    """Threshold evaluator base class that reads its data from Prometheus.

    The Prometheus client is created once, at construction time, and
    reused for every query issued through ``_get_metric_data``.
    """

    def __init__(self, conf):
        """Initialise the evaluator and its Prometheus client.

        :param conf: service configuration; ``prometheus_disable_rbac`` and
                     the ``service_credentials`` group are read from it.
        """
        super().__init__(conf)
        self._set_obsclient(conf)
        self._no_rbac = conf.prometheus_disable_rbac

    def _set_obsclient(self, conf):
        """Create the observability client and store it in ``self._prom``.

        Kept as a separate method so that it can be replaced (for example
        in unit tests) without building a keystone session.
        """
        session = keystone_client.get_session(conf)
        opts = {'interface': conf.service_credentials.interface,
                'region_name': conf.service_credentials.region_name}
        self._prom = client.Client('1', session, adapter_options=opts)

    def _get_metric_data(self, query):
        """Run ``query`` against Prometheus and return the client's result.

        :param query: the Prometheus query string to execute.
        """
        # Pass the query as a logger argument so that the message is only
        # formatted when debug logging is actually enabled.
        LOG.debug('Querying Prometheus instance on: %s', query)
        return self._prom.query.query(query, disable_rbac=self._no_rbac)
|
||||
|
||||
|
||||
class PrometheusEvaluator(PrometheusBase):
    """Evaluate alarm rules against the result of a Prometheus query."""

    def _sanitize(self, metric_data):
        """Convert fetched metrics into a list of float statistics.

        :param metric_data: iterable of objects exposing a ``value``
                            attribute.
        :returns: list with ``float(m.value)`` for every metric.
        """
        sanitized = [float(m.value) for m in metric_data]
        LOG.debug('Sanitized Prometheus metric data: %s to statistics: %s',
                  metric_data, sanitized)
        return sanitized

    def evaluate_rule(self, alarm_rule):
        """Evaluate alarm rule.

        :param alarm_rule: rule mapping; its ``query`` key is sent to
                           Prometheus.
        :returns: state, trending state, statistics, number of samples outside
                  threshold and reason
        :raises threshold.InsufficientDataError: when the query yields no
                                                 datapoints.
        """
        metrics = self._get_metric_data(alarm_rule['query'])
        if not metrics:
            LOG.warning('Empty result fetched from Prometheus for query %s',
                        alarm_rule['query'])
            # Normalise a missing result (e.g. None) to an empty list so
            # that the insufficient-data path below is taken instead of
            # failing with a TypeError while iterating.
            metrics = []

        statistics = self._sanitize(metrics)
        if not statistics:
            raise threshold.InsufficientDataError('datapoints are unknown',
                                                  statistics)
        return self._process_statistics(alarm_rule, statistics)

    def _unknown_reason_data(self, alarm, statistics):
        """Build the reason data for an alarm moving to the unknown state.

        :param alarm: the alarm being transitioned (only logged here).
        :param statistics: datapoints gathered so far; may be empty or None.
        :returns: result of ``_reason_data`` for the 'unknown' disposition,
                  given the datapoint count and the most recent datapoint.
        """
        LOG.warning('Transferring alarm %s on unknown reason', alarm)
        if not statistics:
            # Nothing was collected: report zero datapoints and no value.
            return self._reason_data('unknown', 0, None)
        return self._reason_data('unknown', len(statistics), statistics[-1])
|
@ -96,19 +96,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
|
||||
' %(disposition)s threshold, most recent: %(most_recent)s'
|
||||
% dict(reason_data, state=state), reason_data)
|
||||
|
||||
def evaluate_rule(self, alarm_rule):
|
||||
"""Evaluate alarm rule.
|
||||
|
||||
:returns: state, trending state and statistics.
|
||||
"""
|
||||
start, end = self._bound_duration(alarm_rule)
|
||||
statistics = self._statistics(alarm_rule, start, end)
|
||||
statistics = self._sanitize(alarm_rule, statistics)
|
||||
sufficient = len(statistics) >= alarm_rule['evaluation_periods']
|
||||
if not sufficient:
|
||||
raise InsufficientDataError(
|
||||
'%d datapoints are unknown' % alarm_rule['evaluation_periods'],
|
||||
statistics)
|
||||
def _process_statistics(self, alarm_rule, statistics):
|
||||
|
||||
def _compare(value):
|
||||
op = COMPARATORS[alarm_rule['comparison_operator']]
|
||||
@ -129,6 +117,31 @@ class ThresholdEvaluator(evaluator.Evaluator):
|
||||
trending_state = evaluator.ALARM if compared[-1] else evaluator.OK
|
||||
return None, trending_state, statistics, number_outside, None
|
||||
|
||||
def evaluate_rule(self, alarm_rule):
    """Evaluate alarm rule.

    Fetches the statistics for the rule's bounded duration, sanitizes
    them and hands them over to ``_process_statistics``.

    :returns: whatever ``_process_statistics`` returns for the rule.
    :raises InsufficientDataError: when fewer sanitized datapoints than
                                   ``evaluation_periods`` are available.
    """
    period_start, period_end = self._bound_duration(alarm_rule)
    raw_statistics = self._statistics(alarm_rule, period_start, period_end)
    statistics = self._sanitize(alarm_rule, raw_statistics)

    if len(statistics) < alarm_rule['evaluation_periods']:
        raise InsufficientDataError(
            '%d datapoints are unknown' % alarm_rule['evaluation_periods'],
            statistics)

    return self._process_statistics(alarm_rule, statistics)
|
||||
|
||||
def _unknown_reason_data(self, alarm, statistics):
    """Build the reason data for an alarm moving to the unknown state.

    Logs how many datapoints were expected versus received and returns
    ``_reason_data`` for the 'unknown' disposition.
    """
    LOG.warning(f'Expecting {alarm.rule["evaluation_periods"]} datapoints'
                f' but only get {len(statistics)}')
    # The reason intentionally differs from the log message: its format
    # is kept stable because third-party software may depend on it.
    most_recent = statistics[-1] if statistics else None
    expected = alarm.rule['evaluation_periods']
    return self._reason_data('unknown', expected, most_recent)
|
||||
|
||||
def _transition_alarm(self, alarm, state, trending_state, statistics,
|
||||
outside_count, unknown_reason):
|
||||
unknown = alarm.state == evaluator.UNKNOWN
|
||||
@ -143,16 +156,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
|
||||
return
|
||||
|
||||
if state == evaluator.UNKNOWN and not unknown:
|
||||
LOG.warning('Expecting %(expected)d datapoints but only get '
|
||||
'%(actual)d'
|
||||
% {'expected': alarm.rule['evaluation_periods'],
|
||||
'actual': len(statistics)})
|
||||
# Reason is not same as log message because we want to keep
|
||||
# consistent since thirdparty software may depend on old format.
|
||||
last = None if not statistics else statistics[-1]
|
||||
reason_data = self._reason_data('unknown',
|
||||
alarm.rule['evaluation_periods'],
|
||||
last)
|
||||
reason_data = self._unknown_reason_data(alarm, statistics)
|
||||
self._refresh(alarm, state, unknown_reason, reason_data)
|
||||
|
||||
elif state and (alarm.state != state or continuous):
|
||||
|
@ -23,6 +23,7 @@ import aodh.evaluator
|
||||
import aodh.evaluator.event
|
||||
import aodh.evaluator.gnocchi
|
||||
import aodh.evaluator.loadbalancer
|
||||
import aodh.evaluator.prometheus
|
||||
import aodh.evaluator.threshold
|
||||
import aodh.event
|
||||
import aodh.keystone_client
|
||||
@ -38,6 +39,7 @@ def list_opts():
|
||||
itertools.chain(
|
||||
aodh.evaluator.OPTS,
|
||||
aodh.evaluator.event.OPTS,
|
||||
aodh.evaluator.prometheus.OPTS,
|
||||
aodh.evaluator.threshold.OPTS,
|
||||
aodh.evaluator.loadbalancer.OPTS,
|
||||
aodh.notifier.rest.OPTS,
|
||||
|
@ -15,6 +15,7 @@
|
||||
from unittest import mock
|
||||
|
||||
import fixtures
|
||||
import os
|
||||
from oslo_utils import timeutils
|
||||
from oslo_utils import uuidutils
|
||||
|
||||
@ -25,6 +26,12 @@ from aodh.tests import constants
|
||||
from aodh.tests.unit.evaluator import base
|
||||
|
||||
|
||||
# NOTE(mmagr): Overriding PrometheusEvaluator setting to avoid
|
||||
# complaints during init.
|
||||
os.environ['PROMETHEUS_HOST'] = '127.0.0.1'
|
||||
os.environ['PROMETHEUS_PORT'] = '666'
|
||||
|
||||
|
||||
class BaseCompositeEvaluate(base.TestEvaluatorBase):
|
||||
EVALUATOR = composite.CompositeEvaluator
|
||||
|
||||
|
@ -18,11 +18,14 @@ import fixtures
|
||||
import time
|
||||
from unittest import mock
|
||||
|
||||
from observabilityclient import prometheus_client
|
||||
from oslo_config import fixture as fixture_config
|
||||
from stevedore import extension
|
||||
|
||||
from aodh import evaluator
|
||||
from aodh import service
|
||||
|
||||
from aodh.evaluator import prometheus
|
||||
from aodh.tests import base as tests_base
|
||||
|
||||
|
||||
@ -190,3 +193,59 @@ class TestAlarmEvaluationService(tests_base.BaseTestCase):
|
||||
target = svc.partition_coordinator.extract_my_subset
|
||||
self.assertEqual(0, target.call_count)
|
||||
self.assertEqual(0, self.threshold_eval.evaluate.call_count)
|
||||
|
||||
|
||||
class TestPrometheusEvaluator(tests_base.BaseTestCase):
    """Unit tests for rule evaluation in the Prometheus evaluator."""

    def setUp(self):
        super().setUp()
        conf = service.prepare_service(argv=[], config_files=[])
        self.CONF = self.useFixture(fixture_config.Config(conf)).conf

    def test_rule_evaluation(self):
        """Check state, trend and outside count for several thresholds."""
        # (timestamp, value) pairs are (0, 10), (1, 15), ... (5, 15).
        sample_values = (10, 15, 20, 25, 30, 15)
        metric_list = [
            prometheus_client.PrometheusMetric({'metric': 'mtr',
                                                'value': (stamp, value)})
            for stamp, value in enumerate(sample_values)
        ]

        def rule_for(limit):
            # Same query and operator every time; only the threshold varies.
            return {'query': 'mtr', 'threshold': limit,
                    'comparison_operator': 'gt'}

        with mock.patch.object(prometheus.PrometheusEvaluator,
                               '_set_obsclient', return_value=None):
            # mock Prometheus client
            ev = prometheus.PrometheusEvaluator(self.CONF)
            ev._get_metric_data = mock.Mock(return_value=metric_list)

            # test transfer to alarm state
            result = ev.evaluate_rule(rule_for(9))
            state, trend, stats, outside, reason = result
            self.assertEqual('alarm', state)
            self.assertEqual(6, outside)

            # test transfer to ok state
            result = ev.evaluate_rule(rule_for(31))
            state, trend, stats, outside, reason = result
            self.assertEqual('ok', state)
            self.assertEqual(0, outside)

            # test trending to alarm state
            result = ev.evaluate_rule(rule_for(14))
            state, trend, stats, outside, reason = result
            self.assertEqual('alarm', trend)
            self.assertEqual(5, outside)

            # test trending to ok state
            result = ev.evaluate_rule(rule_for(20))
            state, trend, stats, outside, reason = result
            self.assertEqual('ok', trend)
            self.assertEqual(2, outside)
|
||||
|
@ -36,6 +36,7 @@ cachetools>=1.1.6
|
||||
cotyledon>=1.7.3
|
||||
keystoneauth1>=2.1
|
||||
debtcollector>=1.2.0 # Apache-2.0
|
||||
python-observabilityclient>=0.0.4
|
||||
python-octaviaclient>=1.8.0
|
||||
python-dateutil>=2.8.2 # BSD
|
||||
python-heatclient>=1.17.0
|
||||
|
@ -57,6 +57,7 @@ aodh.alarm.rule =
|
||||
event = aodh.api.controllers.v2.alarm_rules.event:AlarmEventRule
|
||||
composite = aodh.api.controllers.v2.alarm_rules.composite:composite_rule
|
||||
loadbalancer_member_health = aodh.api.controllers.v2.alarm_rules.loadbalancer:LoadBalancerMemberHealthRule
|
||||
prometheus = aodh.api.controllers.v2.alarm_rules.prometheus:PrometheusRule
|
||||
|
||||
aodh.evaluator =
|
||||
gnocchi_resources_threshold = aodh.evaluator.gnocchi:GnocchiResourceThresholdEvaluator
|
||||
@ -64,6 +65,7 @@ aodh.evaluator =
|
||||
gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator
|
||||
composite = aodh.evaluator.composite:CompositeEvaluator
|
||||
loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator
|
||||
prometheus = aodh.evaluator.prometheus:PrometheusEvaluator
|
||||
|
||||
aodh.notifier =
|
||||
log = aodh.notifier.log:LogAlarmNotifier
|
||||
|
Loading…
x
Reference in New Issue
Block a user