99373ff459
A skeleton service for singleton threshold evaluation of alarms (i.e. with trivial global partitioning). Change-Id: I6081ac92b04ecffa1fb8783c36e9fe12004c2129
220 lines
7.6 KiB
Python
220 lines
7.6 KiB
Python
# -*- encoding: utf-8 -*-
|
|
#
|
|
# Copyright © 2013 Red Hat, Inc
|
|
#
|
|
# Author: Eoghan Glynn <eglynn@redhat.com>
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import datetime
|
|
import operator
|
|
|
|
from oslo.config import cfg
|
|
|
|
from ceilometer.openstack.common import log
|
|
from ceilometerclient import client as ceiloclient
|
|
|
|
# Module-level logger, named after this module via __name__.
LOG = log.getLogger(__name__)
|
|
|
|
# Maps the name of an alarm comparison operator onto the function of the
# same name in the stdlib operator module; each takes (value, threshold)
# and returns a boolean.
COMPARATORS = dict(
    (op_name, getattr(operator, op_name))
    for op_name in ('gt', 'lt', 'ge', 'le', 'eq', 'ne')
)

# Alarm state names used when reading and updating alarm.state.
UNKNOWN = 'insufficient data'
OK = 'ok'
ALARM = 'alarm'
|
|
|
|
|
|
class Evaluator(object):
    """This class implements the basic alarm threshold evaluation
    logic.

    For every enabled alarm handed over via assign_alarms(), evaluate()
    queries statistics for the alarm's counter over a sliding window,
    compares each datapoint against the alarm threshold and, where the
    transition rules call for it, updates the alarm state through the
    API client and informs the optional notifier.

    NOTE(review): the translation marker ``_`` is used throughout but is
    not defined in this class; presumably it is installed as a builtin
    elsewhere -- confirm.
    """

    # the sliding evaluation window is extended by this many periods to
    # allow for reporting/ingestion lag
    look_back = 1

    # minimum number of datapoints within sliding window to
    # avoid unknown state
    quorum = 1

    def __init__(self, notifier=None):
        """Create an evaluator with no alarms assigned.

        :param notifier: optional object whose notify(alarm, state, reason)
                         is invoked after a successful state update
        """
        self.alarms = []
        self.notifier = notifier
        # built lazily on first access of the _client property
        self.api_client = None

    def assign_alarms(self, alarms):
        """Assign alarms to be evaluated.

        :param alarms: the alarms to iterate over on each evaluate() call;
                       replaces any previous assignment
        """
        self.alarms = alarms

    @property
    def _client(self):
        """Construct or reuse an authenticated API client.

        On first access the client is created from the
        service_credentials options in cfg.CONF and cached on
        self.api_client; later accesses return the cached instance.
        """
        if not self.api_client:
            auth_config = cfg.CONF.service_credentials
            creds = dict(
                os_auth_url=auth_config.os_auth_url,
                os_tenant_name=auth_config.os_tenant_name,
                os_password=auth_config.os_password,
                os_username=auth_config.os_username
            )
            self.api_client = ceiloclient.get_client(2, **creds)
        return self.api_client

    @staticmethod
    def _constraints(alarm):
        """Assert the constraints on the statistics query.

        :param alarm: alarm whose matching_metadata mapping yields one
                      equality constraint per entry
        :returns: a new list of dicts with 'field', 'op' and 'value' keys
        """
        # items() rather than the Python 2-only iteritems(), so that this
        # works unchanged on both Python 2 and Python 3
        return [dict(field=field, op='eq', value=value)
                for field, value in alarm.matching_metadata.items()]

    @classmethod
    def _bound_duration(cls, alarm, constraints):
        """Bound the duration of the statistics query.

        The window ends at the current UTC time and reaches back
        alarm.period seconds for each of the alarm's evaluation periods
        plus cls.look_back extra periods.

        :param alarm: alarm supplying period and evaluation_periods
        :param constraints: list of query constraints; extended in place
        :returns: the same constraints list with the two timestamp
                  bounds appended
        """
        now = datetime.datetime.utcnow()
        window = (alarm.period *
                  (alarm.evaluation_periods + cls.look_back))
        start = now - datetime.timedelta(seconds=window)
        LOG.debug(_('query stats from %(start)s to %(now)s') %
                  {'start': start, 'now': now})
        after = dict(field='timestamp', op='ge', value=start.isoformat())
        before = dict(field='timestamp', op='le', value=now.isoformat())
        constraints.extend([before, after])
        return constraints

    @staticmethod
    def _sanitize(alarm, statistics):
        """Sanitize statistics.

        Ultimately this will be the hook for the exclusion of chaotic
        datapoints for example.

        :param alarm: alarm supplying evaluation_periods
        :param statistics: list of datapoints returned by the query
        :returns: at most evaluation_periods datapoints taken from the
                  end of the list
        """
        LOG.debug(_('sanitize stats %s') % statistics)
        # in practice statistics are always sorted by period start, not
        # strictly required by the API though
        #
        # The query window is look_back periods wider than needed, so the
        # surplus has to be trimmed from the *old* end: the last entry is
        # what the rest of this class treats as the most recent datapoint.
        periods = alarm.evaluation_periods
        # guard against [-0:], which would keep everything
        statistics = statistics[-periods:] if periods > 0 else []
        LOG.debug(_('pruned statistics to %d') % len(statistics))
        return statistics

    def _statistics(self, alarm, query):
        """Retrieve statistics over the current window.

        :param alarm: alarm supplying counter_name and period
        :param query: list of constraints passed as the q argument
        :returns: whatever the client returns, or an empty list if the
                  retrieval raised (the failure is logged, not re-raised)
        """
        LOG.debug(_('stats query %s') % query)
        try:
            return self._client.statistics.list(alarm.counter_name,
                                                q=query,
                                                period=alarm.period)
        except Exception:
            LOG.exception(_('alarm stats retrieval failed'))
            return []

    def _update(self, alarm, state, reason):
        """Refresh alarm state.

        Pushes the new state through the API client, mirrors it onto the
        local alarm object and then calls the notifier if one is set.

        :param alarm: alarm to transition
        :param state: the new state
        :param reason: human-readable explanation, logged and forwarded
                       to the notifier
        """
        alarm_id = alarm.alarm_id
        LOG.info(_('alarm %(id)s transitioning to %(state)s'
                   ' because %(reason)s') %
                 {'id': alarm_id, 'state': state, 'reason': reason})
        try:
            self._client.alarms.update(alarm_id, state=state)
            alarm.state = state
            if self.notifier:
                self.notifier.notify(alarm, state, reason)
        except Exception:
            # retry will occur naturally on the next evaluation
            # cycle (unless alarm state reverts in the meantime)
            LOG.exception(_('alarm state update failed'))

    def _sufficient(self, alarm, statistics):
        """Ensure there is sufficient data for evaluation,
        transitioning to unknown otherwise.

        :param alarm: alarm being evaluated
        :param statistics: sanitized datapoints for the alarm
        :returns: True if at least self.quorum datapoints are present
        """
        sufficient = len(statistics) >= self.quorum
        if not sufficient and alarm.state != UNKNOWN:
            reason = _('%d datapoints are unknown') % alarm.evaluation_periods
            self._update(alarm, UNKNOWN, reason)
        return sufficient

    @staticmethod
    def _reason(alarm, statistics, distilled, state):
        """Fabricate reason string.

        :param alarm: alarm supplying the name of the statistic to report
        :param statistics: non-empty list of datapoints; the last one is
                           reported as the most recent
        :param distilled: unused here, kept for the existing signature
        :param state: the state being transitioned to
        :returns: the formatted reason string
        """
        details = {
            'state': state,
            'count': len(statistics),
            'disposition': 'inside' if state == OK else 'outside',
            'last': getattr(statistics[-1], alarm.statistic),
        }
        return (_('Transition to %(state)s due to %(count)d samples'
                  ' %(disposition)s threshold, most recent: %(last)s') %
                details)

    def _transition(self, alarm, statistics, compared):
        """Transition alarm state if necessary.

        The transition rules are currently hardcoded as:

        - transitioning from a known state requires an unequivocal
          set of datapoints

        - transitioning from unknown is on the basis of the most
          recent datapoint if equivocal

        Ultimately this will be policy-driven.

        :param alarm: alarm being evaluated
        :param statistics: datapoints the comparisons were derived from
        :param compared: list of booleans, one per datapoint, True where
                         the datapoint satisfied the alarm comparison
        """
        distilled = all(compared)
        unequivocal = distilled or not any(compared)
        if unequivocal:
            state = ALARM if distilled else OK
            if alarm.state != state:
                reason = self._reason(alarm, statistics, distilled, state)
                self._update(alarm, state, reason)
        elif alarm.state == UNKNOWN:
            state = ALARM if compared[-1] else OK
            reason = self._reason(alarm, statistics, distilled, state)
            self._update(alarm, state, reason)

    def evaluate(self):
        """Evaluate the alarms assigned to this evaluator.

        Disabled alarms are skipped. For each remaining alarm the
        statistics are queried and sanitized and, if there are enough of
        them, each datapoint is compared against the threshold and the
        results are handed to _transition().
        """
        LOG.info(_('initiating evaluation cycle on %d alarms') %
                 len(self.alarms))

        for alarm in self.alarms:

            if not alarm.enabled:
                LOG.debug(_('skipping alarm %s') % alarm.alarm_id)
                continue
            LOG.debug(_('evaluating alarm %s') % alarm.alarm_id)

            query = self._bound_duration(
                alarm,
                self._constraints(alarm)
            )

            statistics = self._sanitize(
                alarm,
                self._statistics(alarm, query)
            )

            if not self._sufficient(alarm, statistics):
                continue

            # the comparator and threshold are fixed per alarm, so look
            # them up once rather than once per datapoint
            op = COMPARATORS[alarm.comparison_operator]
            limit = alarm.threshold
            compared = []
            for stat in statistics:
                value = getattr(stat, alarm.statistic)
                LOG.debug(_('comparing value %(value)s against threshold'
                            ' %(limit)s') %
                          {'value': value, 'limit': limit})
                compared.append(op(value, limit))

            self._transition(alarm, statistics, compared)
|