Merge "Add Prometheus evaluator"
This commit is contained in:
commit
197440c3c8
@ -41,7 +41,8 @@ class CompositeRule(wtypes.UserType):
|
|||||||
threshold_plugins = None
|
threshold_plugins = None
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
threshold_rules = ('gnocchi_resources_threshold',
|
threshold_rules = ('prometheus',
|
||||||
|
'gnocchi_resources_threshold',
|
||||||
'gnocchi_aggregation_by_metrics_threshold',
|
'gnocchi_aggregation_by_metrics_threshold',
|
||||||
'gnocchi_aggregation_by_resources_threshold')
|
'gnocchi_aggregation_by_resources_threshold')
|
||||||
CompositeRule.threshold_plugins = named.NamedExtensionManager(
|
CompositeRule.threshold_plugins = named.NamedExtensionManager(
|
||||||
|
46
aodh/api/controllers/v2/alarm_rules/prometheus.py
Normal file
46
aodh/api/controllers/v2/alarm_rules/prometheus.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
#
|
||||||
|
# Copyright 2023 Red Hat, Inc
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License. You may obtain
|
||||||
|
# a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
# License for the specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
from oslo_log import log
|
||||||
|
import wsme
|
||||||
|
from wsme import types as wtypes
|
||||||
|
|
||||||
|
from aodh.api.controllers.v2 import base
|
||||||
|
|
||||||
|
|
||||||
|
LOG = log.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PrometheusRule(base.AlarmRule):
    """Alarm rule made of a Prometheus query, a threshold and a comparator."""

    comparison_operator = base.AdvEnum(
        'comparison_operator', str,
        'lt', 'le', 'eq', 'ne', 'ge', 'gt',
        default='eq')
    "The comparison against the alarm threshold"

    threshold = wsme.wsattr(float, mandatory=True)
    "The threshold of the alarm"

    query = wsme.wsattr(wtypes.text, mandatory=True)
    "The Prometheus query"

    @staticmethod
    def validate(rule):
        """Return ``rule`` unchanged; no validation is performed yet."""
        # TO-DO(mmagr): validate Prometheus query maybe?
        return rule

    def as_dict(self):
        """Return the rule as a dict built by ``as_dict_from_keys``.

        Only the comparison operator, threshold and query are exported.
        """
        exported_keys = ['comparison_operator', 'threshold', 'query']
        return self.as_dict_from_keys(exported_keys)
|
@ -116,7 +116,8 @@ class CompositeEvaluator(evaluator.Evaluator):
|
|||||||
@property
|
@property
|
||||||
def threshold_evaluators(self):
|
def threshold_evaluators(self):
|
||||||
if not self._threshold_evaluators:
|
if not self._threshold_evaluators:
|
||||||
threshold_types = ('gnocchi_resources_threshold',
|
threshold_types = ('prometheus',
|
||||||
|
'gnocchi_resources_threshold',
|
||||||
'gnocchi_aggregation_by_metrics_threshold',
|
'gnocchi_aggregation_by_metrics_threshold',
|
||||||
'gnocchi_aggregation_by_resources_threshold')
|
'gnocchi_aggregation_by_resources_threshold')
|
||||||
self._threshold_evaluators = stevedore.NamedExtensionManager(
|
self._threshold_evaluators = stevedore.NamedExtensionManager(
|
||||||
|
78
aodh/evaluator/prometheus.py
Normal file
78
aodh/evaluator/prometheus.py
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
#
|
||||||
|
# Copyright 2023 Red Hat, Inc
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License. You may obtain
|
||||||
|
# a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
# License for the specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
from oslo_config import cfg
|
||||||
|
from oslo_log import log
|
||||||
|
|
||||||
|
from observabilityclient import client
|
||||||
|
|
||||||
|
from aodh.evaluator import threshold
|
||||||
|
from aodh import keystone_client
|
||||||
|
|
||||||
|
|
||||||
|
LOG = log.getLogger(__name__)

# Configuration options declared by this module.
OPTS = [
    cfg.BoolOpt(
        'prometheus_disable_rbac',
        default=False,
        help='Disable RBAC for Prometheus evaluator.',
    ),
]
|
||||||
|
|
||||||
|
|
||||||
|
class PrometheusBase(threshold.ThresholdEvaluator):
    """Threshold evaluator base class that reads its data from Prometheus.

    Holds the observability client (``self._prom``) and the RBAC switch
    (``self._no_rbac``) used when querying.
    """

    def __init__(self, conf):
        """Set up the parent evaluator and the Prometheus client.

        :param conf: configuration object; ``prometheus_disable_rbac`` and
                     ``service_credentials`` are read from it.
        """
        super().__init__(conf)
        self._set_obsclient(conf)
        self._no_rbac = conf.prometheus_disable_rbac

    def _set_obsclient(self, conf):
        """Create the observability client and store it in ``self._prom``.

        :param conf: configuration object providing the keystone session
                     settings and ``service_credentials``.
        """
        session = keystone_client.get_session(conf)
        opts = {'interface': conf.service_credentials.interface,
                'region_name': conf.service_credentials.region_name}
        self._prom = client.Client('1', session, adapter_options=opts)

    def _get_metric_data(self, query):
        """Run ``query`` through the client and return its result.

        :param query: the Prometheus query string.
        :returns: whatever ``self._prom.query.query`` returns.
        """
        # Pass the argument to the logger instead of pre-formatting so the
        # message is only built when debug logging is actually enabled.
        LOG.debug('Querying Prometheus instance on: %s', query)
        return self._prom.query.query(query, disable_rbac=self._no_rbac)
|
||||||
|
|
||||||
|
|
||||||
|
class PrometheusEvaluator(PrometheusBase):
    """Evaluate alarm rules against the result of a Prometheus query."""

    def _sanitize(self, metric_data):
        """Convert fetched metrics into a list of floats.

        :param metric_data: iterable of objects exposing a ``value``
                            attribute.
        :returns: list holding ``float(m.value)`` for every metric.
        """
        sanitized = [float(m.value) for m in metric_data]
        LOG.debug('Sanitized Prometheus metric data: %s to statistics: %s',
                  metric_data, sanitized)
        return sanitized

    def evaluate_rule(self, alarm_rule):
        """Evaluate alarm rule.

        :param alarm_rule: rule dict; its ``query`` entry is sent to
                           Prometheus.
        :returns: state, trending state, statistics, number of samples outside
                  threshold and reason
        :raises threshold.InsufficientDataError: when the query yields no
            datapoints.
        """
        query = alarm_rule['query']
        metrics = self._get_metric_data(query)
        if not metrics:
            LOG.warning('Empty result fetched from Prometheus for query %s',
                        query)

        # ``metrics or []`` keeps a falsy, non-iterable result (e.g. None)
        # from crashing the sanitizer; it is reported as insufficient data
        # below instead.
        statistics = self._sanitize(metrics or [])
        if not statistics:
            raise threshold.InsufficientDataError('datapoints are unknown',
                                                  statistics)
        return self._process_statistics(alarm_rule, statistics)

    def _unknown_reason_data(self, alarm, statistics):
        """Build the reason data used when the alarm goes to unknown state.

        :param alarm: the alarm being transitioned (only logged here).
        :param statistics: the datapoints gathered so far, possibly empty.
        :returns: result of ``_reason_data`` for the 'unknown' disposition,
                  the number of datapoints and the most recent one (or None).
        """
        LOG.warning('Transferring alarm %s on unknown reason', alarm)
        last = statistics[-1] if statistics else None
        return self._reason_data('unknown', len(statistics), last)
|
@ -96,19 +96,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
|
|||||||
' %(disposition)s threshold, most recent: %(most_recent)s'
|
' %(disposition)s threshold, most recent: %(most_recent)s'
|
||||||
% dict(reason_data, state=state), reason_data)
|
% dict(reason_data, state=state), reason_data)
|
||||||
|
|
||||||
def evaluate_rule(self, alarm_rule):
|
def _process_statistics(self, alarm_rule, statistics):
|
||||||
"""Evaluate alarm rule.
|
|
||||||
|
|
||||||
:returns: state, trending state and statistics.
|
|
||||||
"""
|
|
||||||
start, end = self._bound_duration(alarm_rule)
|
|
||||||
statistics = self._statistics(alarm_rule, start, end)
|
|
||||||
statistics = self._sanitize(alarm_rule, statistics)
|
|
||||||
sufficient = len(statistics) >= alarm_rule['evaluation_periods']
|
|
||||||
if not sufficient:
|
|
||||||
raise InsufficientDataError(
|
|
||||||
'%d datapoints are unknown' % alarm_rule['evaluation_periods'],
|
|
||||||
statistics)
|
|
||||||
|
|
||||||
def _compare(value):
|
def _compare(value):
|
||||||
op = COMPARATORS[alarm_rule['comparison_operator']]
|
op = COMPARATORS[alarm_rule['comparison_operator']]
|
||||||
@ -129,6 +117,31 @@ class ThresholdEvaluator(evaluator.Evaluator):
|
|||||||
trending_state = evaluator.ALARM if compared[-1] else evaluator.OK
|
trending_state = evaluator.ALARM if compared[-1] else evaluator.OK
|
||||||
return None, trending_state, statistics, number_outside, None
|
return None, trending_state, statistics, number_outside, None
|
||||||
|
|
||||||
|
def evaluate_rule(self, alarm_rule):
    """Fetch, sanitize and evaluate the statistics for an alarm rule.

    :param alarm_rule: rule dict; ``evaluation_periods`` is the minimum
                       number of datapoints required.
    :returns: the result of ``_process_statistics`` for the sanitized
              statistics.
    :raises InsufficientDataError: when fewer than ``evaluation_periods``
        datapoints remain after sanitizing.
    """
    window_start, window_end = self._bound_duration(alarm_rule)
    statistics = self._sanitize(
        alarm_rule,
        self._statistics(alarm_rule, window_start, window_end))
    if len(statistics) < alarm_rule['evaluation_periods']:
        raise InsufficientDataError(
            '%d datapoints are unknown' % alarm_rule['evaluation_periods'],
            statistics)

    return self._process_statistics(alarm_rule, statistics)
|
||||||
|
|
||||||
|
def _unknown_reason_data(self, alarm, statistics):
    """Log the datapoint shortfall and build the 'unknown' reason data.

    :param alarm: alarm whose rule provides ``evaluation_periods``.
    :param statistics: the datapoints gathered so far, possibly empty.
    :returns: result of ``_reason_data`` for the 'unknown' disposition.
    """
    LOG.warning(f'Expecting {alarm.rule["evaluation_periods"]} datapoints'
                f' but only get {len(statistics)}')
    # Reason is not same as log message because we want to keep
    # consistent since thirdparty software may depend on old format.
    most_recent = statistics[-1] if statistics else None
    return self._reason_data('unknown', alarm.rule['evaluation_periods'],
                             most_recent)
|
||||||
|
|
||||||
def _transition_alarm(self, alarm, state, trending_state, statistics,
|
def _transition_alarm(self, alarm, state, trending_state, statistics,
|
||||||
outside_count, unknown_reason):
|
outside_count, unknown_reason):
|
||||||
unknown = alarm.state == evaluator.UNKNOWN
|
unknown = alarm.state == evaluator.UNKNOWN
|
||||||
@ -143,16 +156,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
|
|||||||
return
|
return
|
||||||
|
|
||||||
if state == evaluator.UNKNOWN and not unknown:
|
if state == evaluator.UNKNOWN and not unknown:
|
||||||
LOG.warning('Expecting %(expected)d datapoints but only get '
|
reason_data = self._unknown_reason_data(alarm, statistics)
|
||||||
'%(actual)d'
|
|
||||||
% {'expected': alarm.rule['evaluation_periods'],
|
|
||||||
'actual': len(statistics)})
|
|
||||||
# Reason is not same as log message because we want to keep
|
|
||||||
# consistent since thirdparty software may depend on old format.
|
|
||||||
last = None if not statistics else statistics[-1]
|
|
||||||
reason_data = self._reason_data('unknown',
|
|
||||||
alarm.rule['evaluation_periods'],
|
|
||||||
last)
|
|
||||||
self._refresh(alarm, state, unknown_reason, reason_data)
|
self._refresh(alarm, state, unknown_reason, reason_data)
|
||||||
|
|
||||||
elif state and (alarm.state != state or continuous):
|
elif state and (alarm.state != state or continuous):
|
||||||
|
@ -23,6 +23,7 @@ import aodh.evaluator
|
|||||||
import aodh.evaluator.event
|
import aodh.evaluator.event
|
||||||
import aodh.evaluator.gnocchi
|
import aodh.evaluator.gnocchi
|
||||||
import aodh.evaluator.loadbalancer
|
import aodh.evaluator.loadbalancer
|
||||||
|
import aodh.evaluator.prometheus
|
||||||
import aodh.evaluator.threshold
|
import aodh.evaluator.threshold
|
||||||
import aodh.event
|
import aodh.event
|
||||||
import aodh.keystone_client
|
import aodh.keystone_client
|
||||||
@ -38,6 +39,7 @@ def list_opts():
|
|||||||
itertools.chain(
|
itertools.chain(
|
||||||
aodh.evaluator.OPTS,
|
aodh.evaluator.OPTS,
|
||||||
aodh.evaluator.event.OPTS,
|
aodh.evaluator.event.OPTS,
|
||||||
|
aodh.evaluator.prometheus.OPTS,
|
||||||
aodh.evaluator.threshold.OPTS,
|
aodh.evaluator.threshold.OPTS,
|
||||||
aodh.evaluator.loadbalancer.OPTS,
|
aodh.evaluator.loadbalancer.OPTS,
|
||||||
aodh.notifier.rest.OPTS,
|
aodh.notifier.rest.OPTS,
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
import fixtures
|
import fixtures
|
||||||
|
import os
|
||||||
from oslo_utils import timeutils
|
from oslo_utils import timeutils
|
||||||
from oslo_utils import uuidutils
|
from oslo_utils import uuidutils
|
||||||
|
|
||||||
@ -25,6 +26,12 @@ from aodh.tests import constants
|
|||||||
from aodh.tests.unit.evaluator import base
|
from aodh.tests.unit.evaluator import base
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(mmagr): Overriding PrometheusEvaluator setting to avoid
|
||||||
|
# complaints during init.
|
||||||
|
os.environ['PROMETHEUS_HOST'] = '127.0.0.1'
|
||||||
|
os.environ['PROMETHEUS_PORT'] = '666'
|
||||||
|
|
||||||
|
|
||||||
class BaseCompositeEvaluate(base.TestEvaluatorBase):
|
class BaseCompositeEvaluate(base.TestEvaluatorBase):
|
||||||
EVALUATOR = composite.CompositeEvaluator
|
EVALUATOR = composite.CompositeEvaluator
|
||||||
|
|
||||||
|
@ -18,11 +18,14 @@ import fixtures
|
|||||||
import time
|
import time
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
|
from observabilityclient import prometheus_client
|
||||||
from oslo_config import fixture as fixture_config
|
from oslo_config import fixture as fixture_config
|
||||||
from stevedore import extension
|
from stevedore import extension
|
||||||
|
|
||||||
from aodh import evaluator
|
from aodh import evaluator
|
||||||
from aodh import service
|
from aodh import service
|
||||||
|
|
||||||
|
from aodh.evaluator import prometheus
|
||||||
from aodh.tests import base as tests_base
|
from aodh.tests import base as tests_base
|
||||||
|
|
||||||
|
|
||||||
@ -190,3 +193,59 @@ class TestAlarmEvaluationService(tests_base.BaseTestCase):
|
|||||||
target = svc.partition_coordinator.extract_my_subset
|
target = svc.partition_coordinator.extract_my_subset
|
||||||
self.assertEqual(0, target.call_count)
|
self.assertEqual(0, target.call_count)
|
||||||
self.assertEqual(0, self.threshold_eval.evaluate.call_count)
|
self.assertEqual(0, self.threshold_eval.evaluate.call_count)
|
||||||
|
|
||||||
|
|
||||||
|
class TestPrometheusEvaluator(tests_base.BaseTestCase):
    """Unit tests for the Prometheus alarm evaluator."""

    def setUp(self):
        super().setUp()
        conf = service.prepare_service(argv=[], config_files=[])
        self.CONF = self.useFixture(fixture_config.Config(conf)).conf

    def test_rule_evaluation(self):
        """Check state and trend results for several thresholds."""
        # Six samples, each paired with its index as the first tuple item.
        samples = (10, 15, 20, 25, 30, 15)
        metric_list = [
            prometheus_client.PrometheusMetric({'metric': 'mtr',
                                                'value': (index, sample)})
            for index, sample in enumerate(samples)
        ]
        with mock.patch.object(prometheus.PrometheusEvaluator,
                               '_set_obsclient', return_value=None):
            # mock Prometheus client
            ev = prometheus.PrometheusEvaluator(self.CONF)
            ev._get_metric_data = mock.Mock(return_value=metric_list)

            def evaluate(limit):
                # Every case uses the same query and operator; only the
                # threshold varies.
                return ev.evaluate_rule({'query': 'mtr',
                                         'threshold': limit,
                                         'comparison_operator': 'gt'})

            # test transfer to alarm state
            state, _trend, _stats, outside, _reason = evaluate(9)
            self.assertEqual('alarm', state)
            self.assertEqual(6, outside)

            # test transfer to ok state
            state, _trend, _stats, outside, _reason = evaluate(31)
            self.assertEqual('ok', state)
            self.assertEqual(0, outside)

            # test trending to alarm state
            _state, trend, _stats, outside, _reason = evaluate(14)
            self.assertEqual('alarm', trend)
            self.assertEqual(5, outside)

            # test trending to ok state
            _state, trend, _stats, outside, _reason = evaluate(20)
            self.assertEqual('ok', trend)
            self.assertEqual(2, outside)
|
||||||
|
@ -36,6 +36,7 @@ cachetools>=1.1.6
|
|||||||
cotyledon>=1.7.3
|
cotyledon>=1.7.3
|
||||||
keystoneauth1>=2.1
|
keystoneauth1>=2.1
|
||||||
debtcollector>=1.2.0 # Apache-2.0
|
debtcollector>=1.2.0 # Apache-2.0
|
||||||
|
python-observabilityclient>=0.0.4
|
||||||
python-octaviaclient>=1.8.0
|
python-octaviaclient>=1.8.0
|
||||||
python-dateutil>=2.8.2 # BSD
|
python-dateutil>=2.8.2 # BSD
|
||||||
python-heatclient>=1.17.0
|
python-heatclient>=1.17.0
|
||||||
|
@ -57,6 +57,7 @@ aodh.alarm.rule =
|
|||||||
event = aodh.api.controllers.v2.alarm_rules.event:AlarmEventRule
|
event = aodh.api.controllers.v2.alarm_rules.event:AlarmEventRule
|
||||||
composite = aodh.api.controllers.v2.alarm_rules.composite:composite_rule
|
composite = aodh.api.controllers.v2.alarm_rules.composite:composite_rule
|
||||||
loadbalancer_member_health = aodh.api.controllers.v2.alarm_rules.loadbalancer:LoadBalancerMemberHealthRule
|
loadbalancer_member_health = aodh.api.controllers.v2.alarm_rules.loadbalancer:LoadBalancerMemberHealthRule
|
||||||
|
prometheus = aodh.api.controllers.v2.alarm_rules.prometheus:PrometheusRule
|
||||||
|
|
||||||
aodh.evaluator =
|
aodh.evaluator =
|
||||||
gnocchi_resources_threshold = aodh.evaluator.gnocchi:GnocchiResourceThresholdEvaluator
|
gnocchi_resources_threshold = aodh.evaluator.gnocchi:GnocchiResourceThresholdEvaluator
|
||||||
@ -64,6 +65,7 @@ aodh.evaluator =
|
|||||||
gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator
|
gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator
|
||||||
composite = aodh.evaluator.composite:CompositeEvaluator
|
composite = aodh.evaluator.composite:CompositeEvaluator
|
||||||
loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator
|
loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator
|
||||||
|
prometheus = aodh.evaluator.prometheus:PrometheusEvaluator
|
||||||
|
|
||||||
aodh.notifier =
|
aodh.notifier =
|
||||||
log = aodh.notifier.log:LogAlarmNotifier
|
log = aodh.notifier.log:LogAlarmNotifier
|
||||||
|
Loading…
x
Reference in New Issue
Block a user