aodh/ceilometer/alarm/threshold_evaluation.py
Julien Danjou 7851969319 alarm api: rename counter_name to meter_name
We already have a really poor naming convention in the metering API, and we
decided to remove the counter term from everywhere. We can't fix the
metering API since we released it, so let's fix the alarming one before
it gets released and we have to handle a lot of complicated
compatibility. :-(

Change-Id: I3e3219d2eae0b72ad4a898630cacfd334e9390cc
2013-09-04 14:06:40 +02:00

241 lines
8.7 KiB
Python

# -*- encoding: utf-8 -*-
#
# Copyright © 2013 Red Hat, Inc
#
# Author: Eoghan Glynn <eglynn@redhat.com>
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import datetime
import operator
from oslo.config import cfg
from ceilometer.openstack.common import log
from ceilometer.openstack.common import timeutils
from ceilometerclient import client as ceiloclient
from ceilometer.openstack.common.gettextutils import _
# module-level logger, named after this module
LOG = log.getLogger(__name__)
# maps each supported comparison operator name onto the function of the
# same name in the operator module
COMPARATORS = dict(
    (name, getattr(operator, name))
    for name in ('gt', 'lt', 'ge', 'le', 'eq', 'ne')
)

# the states an alarm may be in
OK = 'ok'
ALARM = 'alarm'
UNKNOWN = 'insufficient data'
class Evaluator(object):
    """This class implements the basic alarm threshold evaluation
    logic.

    For every enabled alarm assigned to it, the evaluator queries the
    statistics API over a sliding time window, compares each returned
    datapoint against the alarm threshold and refreshes the alarm state
    (``ok``, ``alarm`` or ``insufficient data``) accordingly.
    """

    # the sliding evaluation window is extended to allow
    # for reporting/ingestion lag
    look_back = 1

    # minimum number of datapoints within sliding window to
    # avoid unknown state
    quorum = 1

    def __init__(self, notifier=None):
        """Create an evaluator with no alarms assigned.

        :param notifier: optional object whose ``notify(alarm, previous,
                         reason)`` method is invoked on each state refresh
        """
        self.alarms = []
        self.notifier = notifier
        # created lazily on first use, see _client
        self.api_client = None

    def assign_alarms(self, alarms):
        """Assign alarms to be evaluated."""
        self.alarms = alarms

    @property
    def _client(self):
        """Construct or reuse an authenticated API client.

        The client is built from the ``service_credentials`` configuration
        group on first access and cached on ``self.api_client``.
        """
        if not self.api_client:
            auth_config = cfg.CONF.service_credentials
            creds = dict(
                os_auth_url=auth_config.os_auth_url,
                os_tenant_name=auth_config.os_tenant_name,
                os_password=auth_config.os_password,
                os_username=auth_config.os_username,
                cacert=auth_config.os_cacert,
                endpoint_type=auth_config.os_endpoint_type,
            )
            self.api_client = ceiloclient.get_client(2, **creds)
        return self.api_client

    @staticmethod
    def _constraints(alarm):
        """Assert the constraints on the statistics query.

        :param alarm: alarm whose ``matching_metadata`` mapping is turned
                      into query constraints
        :returns: a new list with one ``eq`` constraint dict per metadata
                  item
        """
        # items() rather than iteritems() so that this works on both
        # Python 2 and Python 3 mappings
        return [dict(field=field, op='eq', value=value)
                for (field, value) in alarm.matching_metadata.items()]

    @classmethod
    def _bound_duration(cls, alarm, constraints):
        """Bound the duration of the statistics query.

        The window covers ``evaluation_periods + look_back`` periods,
        ending now.

        :param alarm: alarm providing ``period`` and ``evaluation_periods``
        :param constraints: list of constraint dicts; extended in place
                            with the two timestamp bounds
        :returns: the same ``constraints`` list
        """
        now = timeutils.utcnow()
        window = (alarm.period *
                  (alarm.evaluation_periods + cls.look_back))
        start = now - datetime.timedelta(seconds=window)
        LOG.debug(_('query stats from %(start)s to '
                    '%(now)s') % {'start': start, 'now': now})
        after = dict(field='timestamp', op='ge', value=start.isoformat())
        before = dict(field='timestamp', op='le', value=now.isoformat())
        constraints.extend([before, after])
        return constraints

    @staticmethod
    def _sanitize(alarm, statistics):
        """Sanitize statistics.

        Ultimately this will be the hook for the exclusion of chaotic
        datapoints for example.

        Currently this only prunes the statistics down to at most
        ``alarm.evaluation_periods`` datapoints, keeping the trailing ones.

        :param alarm: alarm providing ``evaluation_periods``
        :param statistics: sequence of datapoints from the statistics query
        :returns: the pruned statistics
        """
        LOG.debug(_('sanitize stats %s') % statistics)
        # in practice statistics are always sorted by period start, not
        # strictly required by the API though
        #
        # The query window is look_back periods wider than the evaluation
        # window, so surplus datapoints may be returned. Drop them from the
        # front so that the datapoints actually evaluated are the latest
        # ones, consistent with _reason and _transition treating the last
        # element as the most recent.
        surplus = max(len(statistics) - alarm.evaluation_periods, 0)
        statistics = statistics[surplus:]
        LOG.debug(_('pruned statistics to %d') % len(statistics))
        return statistics

    def _statistics(self, alarm, query):
        """Retrieve statistics over the current window.

        :param alarm: alarm providing ``meter_name`` and ``period``
        :param query: list of constraint dicts for the statistics query
        :returns: the statistics returned by the API client, or an empty
                  list if the retrieval raised
        """
        LOG.debug(_('stats query %s') % query)
        try:
            return self._client.statistics.list(alarm.meter_name,
                                                q=query,
                                                period=alarm.period)
        except Exception:
            # best effort: an empty result leads to the alarm being
            # treated as having insufficient data
            LOG.exception(_('alarm stats retrieval failed'))
            return []

    def _refresh(self, alarm, state, reason):
        """Refresh alarm state.

        The state is pushed through the API only when it differs from the
        current one; the notifier (if any) is invoked on every refresh.

        :param alarm: alarm to refresh
        :param state: state the alarm should be in
        :param reason: human-readable explanation passed to the notifier
        """
        try:
            previous = alarm.state
            if previous != state:
                LOG.info(_('alarm %(id)s transitioning to %(state)s because '
                           '%(reason)s') % {'id': alarm.alarm_id,
                                            'state': state,
                                            'reason': reason})
                self._client.alarms.update(alarm.alarm_id, state=state)
            alarm.state = state
            if self.notifier:
                self.notifier.notify(alarm, previous, reason)
        except Exception:
            # retry will occur naturally on the next evaluation
            # cycle (unless alarm state reverts in the meantime)
            LOG.exception(_('alarm state update failed'))

    def _sufficient(self, alarm, statistics):
        """Ensure there is sufficient data for evaluation,
        transitioning to unknown otherwise.

        :param alarm: alarm being evaluated
        :param statistics: sanitized datapoints for the alarm
        :returns: True if at least ``quorum`` datapoints are available
        """
        sufficient = len(statistics) >= self.quorum
        if not sufficient and alarm.state != UNKNOWN:
            reason = _('%d datapoints are unknown') % alarm.evaluation_periods
            self._refresh(alarm, UNKNOWN, reason)
        return sufficient

    @staticmethod
    def _reason(alarm, statistics, distilled, state):
        """Fabricate reason string.

        :param alarm: alarm being evaluated; ``alarm.statistic`` names the
                      attribute read from the last datapoint
        :param statistics: non-empty sequence of datapoints
        :param distilled: currently unused
        :param state: state the alarm is moving to or remaining in
        :returns: the reason message
        """
        count = len(statistics)
        disposition = 'inside' if state == OK else 'outside'
        last = getattr(statistics[-1], alarm.statistic)
        transition = alarm.state != state
        if transition:
            return (_('Transition to %(state)s due to %(count)d samples'
                      ' %(disposition)s threshold, most recent: %(last)s') %
                    {'state': state, 'count': count,
                     'disposition': disposition, 'last': last})
        return (_('Remaining as %(state)s due to %(count)d samples'
                  ' %(disposition)s threshold, most recent: %(last)s') %
                {'state': state, 'count': count,
                 'disposition': disposition, 'last': last})

    def _transition(self, alarm, statistics, compared):
        """Transition alarm state if necessary.

        The transition rules are currently hardcoded as:

        - transitioning from a known state requires an unequivocal
          set of datapoints

        - transitioning from unknown is on the basis of the most
          recent datapoint if equivocal

        Ultimately this will be policy-driven.

        :param alarm: alarm being evaluated
        :param statistics: datapoints that were compared
        :param compared: list of booleans, one per datapoint, True where
                         the datapoint breaches the threshold
        """
        distilled = all(compared)
        unequivocal = distilled or not any(compared)
        unknown = alarm.state == UNKNOWN
        continuous = alarm.repeat_actions

        if unequivocal:
            state = ALARM if distilled else OK
            reason = self._reason(alarm, statistics, distilled, state)
            # alarms with repeat_actions are refreshed even when the
            # state is unchanged
            if alarm.state != state or continuous:
                self._refresh(alarm, state, reason)
        elif unknown or continuous:
            trending_state = ALARM if compared[-1] else OK
            state = trending_state if unknown else alarm.state
            reason = self._reason(alarm, statistics, distilled, state)
            self._refresh(alarm, state, reason)

    def evaluate(self):
        """Evaluate the alarms assigned to this evaluator.

        Disabled alarms are skipped. For each remaining alarm the
        statistics are queried and sanitized and, if sufficient, compared
        against the threshold to drive any state transition.
        """
        LOG.info(_('initiating evaluation cycle on %d alarms') %
                 len(self.alarms))
        for alarm in self.alarms:

            if not alarm.enabled:
                LOG.debug(_('skipping alarm %s') % alarm.alarm_id)
                continue
            LOG.debug(_('evaluating alarm %s') % alarm.alarm_id)

            query = self._bound_duration(
                alarm,
                self._constraints(alarm)
            )

            statistics = self._sanitize(
                alarm,
                self._statistics(alarm, query)
            )

            if self._sufficient(alarm, statistics):

                def _compare(stat):
                    op = COMPARATORS[alarm.comparison_operator]
                    value = getattr(stat, alarm.statistic)
                    limit = alarm.threshold
                    LOG.debug(_('comparing value %(value)s against threshold'
                                ' %(limit)s') %
                              {'value': value, 'limit': limit})
                    return op(value, limit)

                self._transition(alarm,
                                 statistics,
                                 [_compare(stat) for stat in statistics])