Add load balancer pool member evaluator

This patch adds a new evaluator for alarms of type
'loadbalancer_member_health', which evaluates the alarm by checking
the operating status of the members in a given load balancer pool.

A new config option 'member_creation_time' is introduced so that
members which were created recently and are still initializing are
ignored during the check.

This is the first part of the auto-healing solution.

Change-Id: I57b848e6dc6aa5e79af1c17dbf1a42a9f068f174
This commit is contained in:
Lingxian Kong 2019-04-15 22:58:27 +12:00
parent 6aa6fd0626
commit 018b7d45fe
7 changed files with 326 additions and 0 deletions

View File

@ -0,0 +1,153 @@
# Copyright 2019 Catalyst Cloud Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
from dateutil import parser
from octaviaclient.api.v2 import octavia
from oslo_config import cfg
from oslo_log import log
from oslo_utils import timeutils
import six
from aodh import evaluator
from aodh.evaluator import threshold
from aodh import keystone_client as aodh_keystone
LOG = log.getLogger(__name__)

# Alarm type name used in the reason data built by the evaluator below.
ALARM_TYPE = "loadbalancer_member_health"

# Configuration options declared by this module.
OPTS = [
    cfg.IntOpt(
        'member_creation_time',
        default=120,
        # Members younger than this many seconds are skipped by the
        # evaluator's health check.
        help=('The time in seconds to wait for the load balancer '
              'member creation.'),
    ),
]
class LoadBalancerMemberHealthEvaluator(evaluator.Evaluator):
    """Evaluator for alarms of type ``loadbalancer_member_health``.

    The alarm rule must contain a ``pool_id``; it may also contain
    ``stack_id`` and ``autoscaling_group_id``, which are only copied into
    the reason data. The alarm is set to ALARM when at least one member of
    the pool is considered unhealthy, to OK when none is, and to UNKNOWN
    when the member list cannot be retrieved.
    """

    def __init__(self, conf):
        super(LoadBalancerMemberHealthEvaluator, self).__init__(conf)
        # Built lazily by the ``lb_client`` property.
        self._lb_client = None

    @property
    def lb_client(self):
        """Return the Octavia API client, creating it on first access."""
        if self._lb_client is None:
            endpoint = aodh_keystone.url_for(
                self.conf,
                service_type='load-balancer',
                interface="internal",
                region_name=self.conf.service_credentials.region_name
            )
            self._lb_client = octavia.OctaviaAPI(
                session=aodh_keystone.get_session(self.conf),
                service_type='load-balancer',
                endpoint=endpoint
            )

        return self._lb_client

    def _get_unhealthy_members(self, pool_id):
        """Return the list of unhealthy members in a pool.

        A member (virtual machine) keeps the ERROR operating status after
        creation until the application inside it is up and running, so
        members created less than ``member_creation_time`` seconds ago are
        ignored during the check.

        :param pool_id: ID of the load balancer pool to inspect.
        :returns: list of member dicts that have ``admin_state_up`` set and
                  whose ``operating_status`` is ``ERROR``.
        :raises threshold.InsufficientDataError: if the member list request
                fails or returns an error response.
        """
        try:
            ret = self.lb_client.member_list(pool_id)
        except Exception as e:
            LOG.warning("Failed to communicate with load balancing service, "
                        "error: %s", six.text_type(e))
            raise threshold.InsufficientDataError(
                'failed to communicate with load balancing service',
                []
            )

        if getattr(ret, 'status_code', None):
            # A response object (rather than the decoded body) came back,
            # which means some error happened.
            raise threshold.InsufficientDataError(ret.content, [])

        # These values do not change between members; compute them once so
        # that every member is compared against the same point in time.
        grace_seconds = self.conf.member_creation_time
        grace_period = datetime.timedelta(seconds=grace_seconds)
        now = timeutils.utcnow()

        unhealthy_members = []
        for m in ret.get("members", []):
            try:
                created_time = parser.parse(m['created_at'], ignoretz=True)
            except (ValueError, TypeError, OverflowError):
                # dateutil raises ValueError for malformed strings,
                # TypeError for non-string input (e.g. None) and
                # OverflowError for out-of-range dates; in every case the
                # member's age is unknown, so it is skipped.
                LOG.warning('Failed to parse the member created time.')
                continue

            if now - created_time < grace_period:
                LOG.debug("Ignore member which was created within %ss",
                          grace_seconds)
                continue

            if m["admin_state_up"] and m["operating_status"] == "ERROR":
                unhealthy_members.append(m)

        return unhealthy_members

    def _transition_alarm(self, alarm, new_state, members,
                          count, unknown_reason, pool_id=None,
                          stack_id=None, asg_id=None):
        """Build the reason for ``new_state`` and refresh the alarm.

        :param alarm: the alarm being evaluated.
        :param new_state: the state computed by the evaluation.
        :param members: list of unhealthy members (may be empty).
        :param count: number of unhealthy members.
        :param unknown_reason: if truthy, used as the reason instead of the
                               generated one.
        :param pool_id: pool ID stored in the reason data.
        :param stack_id: stack ID stored in the reason data.
        :param asg_id: autoscaling group ID stored in the reason data.
        """
        transition = alarm.state != new_state
        last = members[-1] if members else None

        reason_data = {
            'type': ALARM_TYPE,
            'count': count,
            'most_recent': last,
            'unhealthy_members': members,
            "pool_id": pool_id,
            "stack_id": stack_id,
            "asg_id": asg_id
        }

        if transition:
            reason = ('Transition to %(state)s due to %(count)d members'
                      ' unhealthy, most recent: %(most_recent)s' %
                      dict(state=new_state, count=count, most_recent=last))
        else:
            reason = ('Remaining as %(state)s' % dict(state=new_state))

        reason = unknown_reason or reason

        # Refresh and trigger alarm based on state transition.
        self._refresh(alarm, new_state, reason, reason_data)

    def evaluate(self, alarm):
        """Evaluate the alarm against the current state of its pool.

        Nothing is done when the alarm is outside its time constraint.

        :param alarm: the alarm to evaluate; ``alarm.rule`` must contain
                      ``pool_id``.
        """
        if not self.within_time_constraint(alarm):
            LOG.debug('Attempted to evaluate alarm %s, but it is not '
                      'within its time constraint.', alarm.alarm_id)
            return

        LOG.debug("Evaluating %s rule alarm %s ...", ALARM_TYPE,
                  alarm.alarm_id)

        pool_id = alarm.rule["pool_id"]
        try:
            error_mems = self._get_unhealthy_members(pool_id)
        except threshold.InsufficientDataError as e:
            # (new_state, members, count, unknown_reason)
            evaluation = (evaluator.UNKNOWN, e.statistics, 0, e.reason)
        else:
            state = evaluator.ALARM if len(error_mems) > 0 else evaluator.OK
            evaluation = (state, error_mems, len(error_mems), None)

        self._transition_alarm(alarm, *evaluation, pool_id=pool_id,
                               stack_id=alarm.rule.get("stack_id"),
                               asg_id=alarm.rule.get("autoscaling_group_id"))

View File

@ -88,6 +88,11 @@ def delete_trust_id(trust_id, auth_plugin):
pass
def url_for(conf, **kwargs):
    """Return the endpoint looked up through the session built from conf.

    All keyword arguments are forwarded unchanged to the session's
    ``get_endpoint`` call.
    """
    return get_session(conf).get_endpoint(**kwargs)
OPTS = [
cfg.StrOpt('region-name',
default=os.environ.get('OS_REGION_NAME'),

View File

@ -22,6 +22,8 @@ import aodh.coordination
import aodh.evaluator
import aodh.evaluator.event
import aodh.evaluator.gnocchi
import aodh.evaluator.loadbalancer
import aodh.evaluator.threshold
import aodh.event
import aodh.keystone_client
import aodh.notifier.rest
@ -37,6 +39,7 @@ def list_opts():
aodh.evaluator.OPTS,
aodh.evaluator.event.OPTS,
aodh.evaluator.threshold.OPTS,
aodh.evaluator.loadbalancer.OPTS,
aodh.notifier.rest.OPTS,
aodh.queue.OPTS,
aodh.service.OPTS)),

View File

@ -0,0 +1,158 @@
# Copyright 2019 Catalyst Cloud Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import mock
from oslo_utils import timeutils
from oslo_utils import uuidutils
from aodh import evaluator
from aodh.evaluator import loadbalancer
from aodh.storage import models
from aodh.tests import constants
from aodh.tests.unit.evaluator import base
@mock.patch('octaviaclient.api.v2.octavia.OctaviaAPI')
@mock.patch('aodh.keystone_client.get_session')
class TestLoadBalancerMemberHealthEvaluator(base.TestEvaluatorBase):
    """Unit tests for the load balancer member health evaluator."""

    EVALUATOR = loadbalancer.LoadBalancerMemberHealthEvaluator

    @staticmethod
    def _new_lb_alarm(state, state_reason):
        """Build a member health alarm starting in the given state."""
        rule = {
            'pool_id': uuidutils.generate_uuid(),
            'stack_id': uuidutils.generate_uuid(),
            'autoscaling_group_id': uuidutils.generate_uuid(),
        }
        return models.Alarm(
            name='lb_member_alarm',
            description='lb_member_alarm',
            type=loadbalancer.ALARM_TYPE,
            enabled=True,
            user_id=uuidutils.generate_uuid(),
            project_id=uuidutils.generate_uuid(dashed=False),
            alarm_id=uuidutils.generate_uuid(),
            state=state,
            state_reason=state_reason,
            state_timestamp=constants.MIN_DATETIME,
            timestamp=constants.MIN_DATETIME,
            insufficient_data_actions=[],
            ok_actions=[],
            alarm_actions=[],
            repeat_actions=False,
            time_constraints=[],
            severity='low',
            rule=rule,
        )

    @staticmethod
    def _day_old_member_list(operating_status):
        """Build a member list body with one member created a day ago."""
        one_day_ago = timeutils.utcnow() - datetime.timedelta(days=1)
        member = {
            'created_at': one_day_ago.isoformat(),
            'admin_state_up': True,
            'operating_status': operating_status,
        }
        return {'members': [member]}

    def test_evaluate(self, mock_session, mock_octavia):
        alarm = self._new_lb_alarm('insufficient data', 'insufficient data')

        client = mock.MagicMock()
        client.member_list.return_value = self._day_old_member_list('ERROR')
        mock_octavia.return_value = client

        self.evaluator.evaluate(alarm)

        self.assertEqual(evaluator.ALARM, alarm.state)

    def test_evaluate_octavia_error(self, mock_session, mock_octavia):
        class ErrorResponse(object):
            # Minimal stand-in exposing the two attributes the evaluator
            # reads from an error response.
            def __init__(self, status_code, content):
                self.status_code = status_code
                self.content = content

        alarm = self._new_lb_alarm('insufficient data', 'insufficient data')

        msg = 'Pool NotFound'
        client = mock.MagicMock()
        client.member_list.return_value = ErrorResponse(404, msg)
        mock_octavia.return_value = client

        self.evaluator.evaluate(alarm)

        self.assertEqual(evaluator.UNKNOWN, alarm.state)
        self.assertEqual(msg, alarm.state_reason)

    def test_evaluate_alarm_to_ok(self, mock_session, mock_octavia):
        alarm = self._new_lb_alarm(evaluator.ALARM, 'alarm')

        client = mock.MagicMock()
        client.member_list.return_value = self._day_old_member_list('ACTIVE')
        mock_octavia.return_value = client

        self.evaluator.evaluate(alarm)

        self.assertEqual(evaluator.OK, alarm.state)

View File

@ -0,0 +1,4 @@
features:
  - Added a new evaluator for the alarms of type
    ``loadbalancer_member_health`` which evaluates the alarm by checking the
    operating status of the members in a given load balancer pool.

View File

@ -35,3 +35,5 @@ cachetools>=1.1.6
cotyledon
keystoneauth1>=2.1
debtcollector>=1.2.0 # Apache-2.0
python-octaviaclient>=1.8.0
python-dateutil # BSD

View File

@ -79,6 +79,7 @@ aodh.evaluator =
gnocchi_aggregation_by_metrics_threshold = aodh.evaluator.gnocchi:GnocchiAggregationMetricsThresholdEvaluator
gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator
composite = aodh.evaluator.composite:CompositeEvaluator
loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator
aodh.notifier =
log = aodh.notifier.log:LogAlarmNotifier