7851969319
We already have a really poor naming convention in the metering API, and we decided to remove the counter term from everywhere. We can't fix the metering API since we released it, so let's fix the alarming one before it gets released and we have to handle a lot of complicated compatibility. :-( Change-Id: I3e3219d2eae0b72ad4a898630cacfd334e9390cc
241 lines
8.7 KiB
Python
241 lines
8.7 KiB
Python
# -*- encoding: utf-8 -*-
|
|
#
|
|
# Copyright © 2013 Red Hat, Inc
|
|
#
|
|
# Author: Eoghan Glynn <eglynn@redhat.com>
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import datetime
|
|
import operator
|
|
|
|
from oslo.config import cfg
|
|
|
|
from ceilometer.openstack.common import log
|
|
from ceilometer.openstack.common import timeutils
|
|
from ceilometerclient import client as ceiloclient
|
|
from ceilometer.openstack.common.gettextutils import _
|
|
|
|
# Module-level logger, named after this module.
LOG = log.getLogger(__name__)
|
|
|
|
# Lookup table from a comparison-operator name to the two-argument
# function of the same name in the standard ``operator`` module.
COMPARATORS = dict(
    (name, getattr(operator, name))
    for name in ('gt', 'lt', 'ge', 'le', 'eq', 'ne')
)
|
|
|
|
# Alarm state names.
UNKNOWN, OK, ALARM = 'insufficient data', 'ok', 'alarm'
|
|
|
|
|
|
class Evaluator(object):
    """Implement the basic alarm threshold evaluation logic.

    For each enabled alarm assigned to it, the evaluator queries
    statistics over a sliding time window, compares every retained
    datapoint against the alarm threshold and refreshes the alarm
    state (optionally notifying) according to the outcome.
    """

    # the sliding evaluation window is extended to allow
    # for reporting/ingestion lag
    look_back = 1

    # minimum number of datapoints within sliding window to
    # avoid unknown state
    quorum = 1

    def __init__(self, notifier=None):
        """Create an evaluator with no alarms assigned.

        :param notifier: optional object whose ``notify(alarm, previous,
                         reason)`` method is called after an alarm refresh
        """
        self.alarms = []
        self.notifier = notifier
        # created lazily on first use, see _client
        self.api_client = None

    def assign_alarms(self, alarms):
        """Assign alarms to be evaluated."""
        self.alarms = alarms

    @property
    def _client(self):
        """Construct or reuse an authenticated API client.

        Credentials are read from the ``service_credentials`` config
        group the first time the client is needed.
        """
        if not self.api_client:
            auth_config = cfg.CONF.service_credentials
            creds = dict(
                os_auth_url=auth_config.os_auth_url,
                os_tenant_name=auth_config.os_tenant_name,
                os_password=auth_config.os_password,
                os_username=auth_config.os_username,
                cacert=auth_config.os_cacert,
                endpoint_type=auth_config.os_endpoint_type,
            )
            self.api_client = ceiloclient.get_client(2, **creds)
        return self.api_client

    @staticmethod
    def _constraints(alarm):
        """Assert the constraints on the statistics query.

        :param alarm: alarm whose ``matching_metadata`` mapping is turned
                      into one equality constraint per entry
        :returns: list of ``dict(field=..., op='eq', value=...)``
        """
        # items() rather than the Python 2-only iteritems(), so the
        # method behaves the same on Python 2 and Python 3
        return [dict(field=field, op='eq', value=value)
                for field, value in alarm.matching_metadata.items()]

    @classmethod
    def _bound_duration(cls, alarm, constraints):
        """Bound the duration of the statistics query.

        The window covers ``evaluation_periods`` plus ``look_back``
        periods ending now. The two timestamp constraints are appended
        to ``constraints`` in place, and the same list is returned.
        """
        now = timeutils.utcnow()
        window = (alarm.period *
                  (alarm.evaluation_periods + cls.look_back))
        start = now - datetime.timedelta(seconds=window)
        LOG.debug(_('query stats from %(start)s to '
                    '%(now)s') % {'start': start, 'now': now})
        after = dict(field='timestamp', op='ge', value=start.isoformat())
        before = dict(field='timestamp', op='le', value=now.isoformat())
        constraints.extend([before, after])
        return constraints

    @staticmethod
    def _most_recent(statistics, count):
        """Return the last ``count`` entries of ``statistics``.

        The start index is computed explicitly instead of slicing with
        ``[-count:]``, which would return the whole list when ``count``
        is 0.
        """
        return statistics[max(len(statistics) - count, 0):]

    @staticmethod
    def _sanitize(alarm, statistics):
        """Sanitize statistics.

        Ultimately this will be the hook for the exclusion of chaotic
        datapoints for example.

        :returns: at most ``alarm.evaluation_periods`` datapoints, taken
                  from the end of ``statistics``
        """
        LOG.debug(_('sanitize stats %s') % statistics)
        # in practice statistics are always sorted by period start, not
        # strictly required by the API though
        #
        # The query window is widened by look_back, so more datapoints
        # than evaluation_periods may come back. Keep the tail: the rest
        # of the class treats the last element as the most recent one,
        # and pruning from the front would discard exactly that datapoint.
        statistics = Evaluator._most_recent(statistics,
                                            alarm.evaluation_periods)
        LOG.debug(_('pruned statistics to %d') % len(statistics))
        return statistics

    def _statistics(self, alarm, query):
        """Retrieve statistics over the current window.

        :returns: the statistics for ``alarm.meter_name``, or an empty
                  list if the retrieval raised (the failure is logged)
        """
        LOG.debug(_('stats query %s') % query)
        try:
            return self._client.statistics.list(alarm.meter_name,
                                                q=query,
                                                period=alarm.period)
        except Exception:
            LOG.exception(_('alarm stats retrieval failed'))
            return []

    def _refresh(self, alarm, state, reason):
        """Refresh alarm state.

        The API is only updated when the state actually changes; the
        notifier, if any, is called on every refresh. Any exception is
        logged and swallowed.
        """
        try:
            previous = alarm.state
            if previous != state:
                LOG.info(_('alarm %(id)s transitioning to %(state)s because '
                           '%(reason)s') % {'id': alarm.alarm_id,
                                            'state': state,
                                            'reason': reason})

                self._client.alarms.update(alarm.alarm_id, state=state)
            alarm.state = state
            if self.notifier:
                self.notifier.notify(alarm, previous, reason)
        except Exception:
            # retry will occur naturally on the next evaluation
            # cycle (unless alarm state reverts in the meantime)
            LOG.exception(_('alarm state update failed'))

    def _sufficient(self, alarm, statistics):
        """Ensure there is sufficient data for evaluation,
        transitioning to unknown otherwise.

        :returns: True if at least ``quorum`` datapoints are available
        """
        sufficient = len(statistics) >= self.quorum
        if not sufficient and alarm.state != UNKNOWN:
            reason = _('%d datapoints are unknown') % alarm.evaluation_periods
            self._refresh(alarm, UNKNOWN, reason)
        return sufficient

    @staticmethod
    def _reason(alarm, statistics, distilled, state):
        """Fabricate reason string.

        ``statistics`` must be non-empty, since its last element is
        reported as the most recent value. ``distilled`` is accepted
        but not used.
        """
        count = len(statistics)
        disposition = 'inside' if state == OK else 'outside'
        last = getattr(statistics[-1], alarm.statistic)
        transition = alarm.state != state
        # the two messages are kept as separate literals so that each
        # remains a complete translatable string
        if transition:
            return (_('Transition to %(state)s due to %(count)d samples'
                      ' %(disposition)s threshold, most recent: %(last)s') %
                    {'state': state, 'count': count,
                     'disposition': disposition, 'last': last})
        return (_('Remaining as %(state)s due to %(count)d samples'
                  ' %(disposition)s threshold, most recent: %(last)s') %
                {'state': state, 'count': count,
                 'disposition': disposition, 'last': last})

    def _transition(self, alarm, statistics, compared):
        """Transition alarm state if necessary.

        The transition rules are currently hardcoded as:

        - transitioning from a known state requires an unequivocal
          set of datapoints

        - transitioning from unknown is on the basis of the most
          recent datapoint if equivocal

        Ultimately this will be policy-driven.

        :param compared: one boolean per datapoint, True where the
                         datapoint satisfied the alarm comparison
        """
        distilled = all(compared)
        unequivocal = distilled or not any(compared)
        unknown = alarm.state == UNKNOWN
        continuous = alarm.repeat_actions

        if unequivocal:
            state = ALARM if distilled else OK
            reason = self._reason(alarm, statistics, distilled, state)
            if alarm.state != state or continuous:
                self._refresh(alarm, state, reason)
        elif unknown or continuous:
            trending_state = ALARM if compared[-1] else OK
            state = trending_state if unknown else alarm.state
            reason = self._reason(alarm, statistics, distilled, state)
            self._refresh(alarm, state, reason)

    def evaluate(self):
        """Evaluate the alarms assigned to this evaluator."""

        LOG.info(_('initiating evaluation cycle on %d alarms') %
                 len(self.alarms))

        for alarm in self.alarms:

            if not alarm.enabled:
                LOG.debug(_('skipping alarm %s') % alarm.alarm_id)
                continue
            LOG.debug(_('evaluating alarm %s') % alarm.alarm_id)

            query = self._bound_duration(
                alarm,
                self._constraints(alarm)
            )

            statistics = self._sanitize(
                alarm,
                self._statistics(alarm, query)
            )

            if self._sufficient(alarm, statistics):

                # called immediately below, so closing over the loop
                # variable ``alarm`` is safe here
                def _compare(stat):
                    op = COMPARATORS[alarm.comparison_operator]
                    value = getattr(stat, alarm.statistic)
                    limit = alarm.threshold
                    LOG.debug(_('comparing value %(value)s against threshold'
                                ' %(limit)s') %
                              {'value': value, 'limit': limit})
                    return op(value, limit)

                self._transition(alarm,
                                 statistics,
                                 [_compare(stat) for stat in statistics])
|