diff --git a/aodh/evaluator/loadbalancer.py b/aodh/evaluator/loadbalancer.py new file mode 100644 index 000000000..6a4770534 --- /dev/null +++ b/aodh/evaluator/loadbalancer.py @@ -0,0 +1,153 @@ +# Copyright 2019 Catalyst Cloud Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime + +from dateutil import parser +from octaviaclient.api.v2 import octavia +from oslo_config import cfg +from oslo_log import log +from oslo_utils import timeutils +import six + +from aodh import evaluator +from aodh.evaluator import threshold +from aodh import keystone_client as aodh_keystone + +LOG = log.getLogger(__name__) + +ALARM_TYPE = "loadbalancer_member_health" + +OPTS = [ + cfg.IntOpt('member_creation_time', + default=120, + help='The time in seconds to wait for the load balancer ' + 'member creation.' 
+ ), +] + + +class LoadBalancerMemberHealthEvaluator(evaluator.Evaluator): + def __init__(self, conf): + super(LoadBalancerMemberHealthEvaluator, self).__init__(conf) + self._lb_client = None + + @property + def lb_client(self): + if self._lb_client is None: + endpoint = aodh_keystone.url_for( + self.conf, + service_type='load-balancer', + interface="internal", + region_name=self.conf.service_credentials.region_name + ) + self._lb_client = octavia.OctaviaAPI( + session=aodh_keystone.get_session(self.conf), + service_type='load-balancer', + endpoint=endpoint + ) + + return self._lb_client + + def _get_unhealthy_members(self, pool_id): + """Get number of unhealthy members in a pool. + + The member (virtual machine) operating_status keeps ERROR after + creation before the application is up and running inside, so it + should be ignored during the check. + """ + unhealthy_members = [] + + try: + ret = self.lb_client.member_list(pool_id) + except Exception as e: + LOG.warning("Failed to communicate with load balancing service, " + "error: %s", six.text_type(e)) + raise threshold.InsufficientDataError( + 'failed to communicate with load balancing service', + [] + ) + + if getattr(ret, 'status_code', None): + # Some error happened + raise threshold.InsufficientDataError(ret.content, []) + + for m in ret.get("members", []): + try: + created_time = parser.parse(m['created_at'], ignoretz=True) + except ValueError as e: + LOG.warning('Failed to parse the member created time.') + continue + + now = timeutils.utcnow() + t = self.conf.member_creation_time + if now - created_time < datetime.timedelta(seconds=t): + LOG.debug("Ignore member which was created within %ss", t) + continue + + if m["admin_state_up"] and m["operating_status"] == "ERROR": + unhealthy_members.append(m) + + return unhealthy_members + + def _transition_alarm(self, alarm, new_state, members, + count, unknown_reason, pool_id=None, + stack_id=None, asg_id=None): + transition = alarm.state != new_state + last = 
members[-1] if members else None + + reason_data = { + 'type': ALARM_TYPE, + 'count': count, + 'most_recent': last, + 'unhealthy_members': members, + "pool_id": pool_id, + "stack_id": stack_id, + "asg_id": asg_id + } + + if transition: + reason = ('Transition to %(state)s due to %(count)d members' + ' unhealthy, most recent: %(most_recent)s' % + dict(state=new_state, count=count, most_recent=last)) + else: + reason = ('Remaining as %(state)s' % dict(state=new_state)) + + reason = unknown_reason or reason + + # Refresh and trigger alarm based on state transition. + self._refresh(alarm, new_state, reason, reason_data) + + def evaluate(self, alarm): + if not self.within_time_constraint(alarm): + LOG.debug('Attempted to evaluate alarm %s, but it is not ' + 'within its time constraint.', alarm.alarm_id) + return + + LOG.debug("Evaluating %s rule alarm %s ...", ALARM_TYPE, + alarm.alarm_id) + + pool_id = alarm.rule["pool_id"] + error_mems = [] + try: + error_mems = self._get_unhealthy_members(pool_id) + except threshold.InsufficientDataError as e: + evaluation = (evaluator.UNKNOWN, e.statistics, 0, e.reason) + else: + state = evaluator.ALARM if len(error_mems) > 0 else evaluator.OK + evaluation = (state, error_mems, len(error_mems), None) + + self._transition_alarm(alarm, *evaluation, pool_id=pool_id, + stack_id=alarm.rule.get("stack_id"), + asg_id=alarm.rule.get("autoscaling_group_id")) diff --git a/aodh/keystone_client.py b/aodh/keystone_client.py index 86fec6439..3cf10ca7d 100644 --- a/aodh/keystone_client.py +++ b/aodh/keystone_client.py @@ -88,6 +88,11 @@ def delete_trust_id(trust_id, auth_plugin): pass +def url_for(conf, **kwargs): + sess = get_session(conf) + return sess.get_endpoint(**kwargs) + + OPTS = [ cfg.StrOpt('region-name', default=os.environ.get('OS_REGION_NAME'), diff --git a/aodh/opts.py b/aodh/opts.py index 2f434e6fd..1960f8257 100644 --- a/aodh/opts.py +++ b/aodh/opts.py @@ -22,6 +22,8 @@ import aodh.coordination import aodh.evaluator import 
aodh.evaluator.event import aodh.evaluator.gnocchi +import aodh.evaluator.loadbalancer +import aodh.evaluator.threshold import aodh.event import aodh.keystone_client import aodh.notifier.rest @@ -37,6 +39,7 @@ def list_opts(): aodh.evaluator.OPTS, aodh.evaluator.event.OPTS, aodh.evaluator.threshold.OPTS, + aodh.evaluator.loadbalancer.OPTS, aodh.notifier.rest.OPTS, aodh.queue.OPTS, aodh.service.OPTS)), diff --git a/aodh/tests/unit/evaluator/test_loadbalancer.py b/aodh/tests/unit/evaluator/test_loadbalancer.py new file mode 100644 index 000000000..567bc135b --- /dev/null +++ b/aodh/tests/unit/evaluator/test_loadbalancer.py @@ -0,0 +1,158 @@ +# Copyright 2019 Catalyst Cloud Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime + +import mock +from oslo_utils import timeutils +from oslo_utils import uuidutils + +from aodh import evaluator +from aodh.evaluator import loadbalancer +from aodh.storage import models +from aodh.tests import constants +from aodh.tests.unit.evaluator import base + + +@mock.patch('octaviaclient.api.v2.octavia.OctaviaAPI') +@mock.patch('aodh.keystone_client.get_session') +class TestLoadBalancerMemberHealthEvaluator(base.TestEvaluatorBase): + EVALUATOR = loadbalancer.LoadBalancerMemberHealthEvaluator + + def test_evaluate(self, mock_session, mock_octavia): + alarm = models.Alarm( + name='lb_member_alarm', + description='lb_member_alarm', + type=loadbalancer.ALARM_TYPE, + enabled=True, + user_id=uuidutils.generate_uuid(), + project_id=uuidutils.generate_uuid(dashed=False), + alarm_id=uuidutils.generate_uuid(), + state='insufficient data', + state_reason='insufficient data', + state_timestamp=constants.MIN_DATETIME, + timestamp=constants.MIN_DATETIME, + insufficient_data_actions=[], + ok_actions=[], + alarm_actions=[], + repeat_actions=False, + time_constraints=[], + severity='low', + rule=dict( + pool_id=uuidutils.generate_uuid(), + stack_id=uuidutils.generate_uuid(), + autoscaling_group_id=uuidutils.generate_uuid(), + ) + ) + + mock_client = mock.MagicMock() + mock_octavia.return_value = mock_client + created_at = timeutils.utcnow() - datetime.timedelta(days=1) + mock_client.member_list.return_value = { + 'members': [ + { + 'created_at': created_at.isoformat(), + 'admin_state_up': True, + 'operating_status': 'ERROR', + } + ] + } + + self.evaluator.evaluate(alarm) + + self.assertEqual(evaluator.ALARM, alarm.state) + + def test_evaluate_octavia_error(self, mock_session, mock_octavia): + class Response(object): + def __init__(self, status_code, content): + self.status_code = status_code + self.content = content + + alarm = models.Alarm( + name='lb_member_alarm', + description='lb_member_alarm', + type=loadbalancer.ALARM_TYPE, + enabled=True, + 
user_id=uuidutils.generate_uuid(), + project_id=uuidutils.generate_uuid(dashed=False), + alarm_id=uuidutils.generate_uuid(), + state='insufficient data', + state_reason='insufficient data', + state_timestamp=constants.MIN_DATETIME, + timestamp=constants.MIN_DATETIME, + insufficient_data_actions=[], + ok_actions=[], + alarm_actions=[], + repeat_actions=False, + time_constraints=[], + severity='low', + rule=dict( + pool_id=uuidutils.generate_uuid(), + stack_id=uuidutils.generate_uuid(), + autoscaling_group_id=uuidutils.generate_uuid(), + ) + ) + + mock_client = mock.MagicMock() + mock_octavia.return_value = mock_client + msg = 'Pool NotFound' + mock_client.member_list.return_value = Response(404, msg) + + self.evaluator.evaluate(alarm) + + self.assertEqual(evaluator.UNKNOWN, alarm.state) + self.assertEqual(msg, alarm.state_reason) + + def test_evaluate_alarm_to_ok(self, mock_session, mock_octavia): + alarm = models.Alarm( + name='lb_member_alarm', + description='lb_member_alarm', + type=loadbalancer.ALARM_TYPE, + enabled=True, + user_id=uuidutils.generate_uuid(), + project_id=uuidutils.generate_uuid(dashed=False), + alarm_id=uuidutils.generate_uuid(), + state=evaluator.ALARM, + state_reason='alarm', + state_timestamp=constants.MIN_DATETIME, + timestamp=constants.MIN_DATETIME, + insufficient_data_actions=[], + ok_actions=[], + alarm_actions=[], + repeat_actions=False, + time_constraints=[], + severity='low', + rule=dict( + pool_id=uuidutils.generate_uuid(), + stack_id=uuidutils.generate_uuid(), + autoscaling_group_id=uuidutils.generate_uuid(), + ) + ) + + mock_client = mock.MagicMock() + mock_octavia.return_value = mock_client + created_at = timeutils.utcnow() - datetime.timedelta(days=1) + mock_client.member_list.return_value = { + 'members': [ + { + 'created_at': created_at.isoformat(), + 'admin_state_up': True, + 'operating_status': 'ACTIVE', + } + ] + } + + self.evaluator.evaluate(alarm) + + self.assertEqual(evaluator.OK, alarm.state) diff --git 
a/releasenotes/notes/loadbalancer-evaluator-85732c5e5f6e11e9.yaml b/releasenotes/notes/loadbalancer-evaluator-85732c5e5f6e11e9.yaml new file mode 100644 index 000000000..5b0e3bfb5 --- /dev/null +++ b/releasenotes/notes/loadbalancer-evaluator-85732c5e5f6e11e9.yaml @@ -0,0 +1,4 @@ +features: + - Added a new evaluator for the alarms of type + ``loadbalancer_member_health`` which evaluates the alarm by checking the + operating status of the members in a given load balancer pool. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index c5b31a67a..5745eeb2e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,5 @@ cachetools>=1.1.6 cotyledon keystoneauth1>=2.1 debtcollector>=1.2.0 # Apache-2.0 +python-octaviaclient>=1.8.0 +python-dateutil # BSD diff --git a/setup.cfg b/setup.cfg index 513317bdd..7e4f91a1b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -79,6 +79,7 @@ aodh.evaluator = gnocchi_aggregation_by_metrics_threshold = aodh.evaluator.gnocchi:GnocchiAggregationMetricsThresholdEvaluator gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator composite = aodh.evaluator.composite:CompositeEvaluator + loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator aodh.notifier = log = aodh.notifier.log:LogAlarmNotifier