Add load balancer pool member evaluator
This patch adds a new evaluator for alarms of type 'loadbalancer_member_health', which evaluates an alarm by checking the operating status of the members in a given load balancer pool. A new config option, 'member_creation_time', is introduced so that members that are still initializing can be ignored. This is the first part of the auto-healing solution.

Change-Id: I57b848e6dc6aa5e79af1c17dbf1a42a9f068f174
This commit is contained in:
parent
6aa6fd0626
commit
018b7d45fe
aodh/evaluator/loadbalancer.py (new file, 153 lines)
@@ -0,0 +1,153 @@
# Copyright 2019 Catalyst Cloud Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime

from dateutil import parser
from octaviaclient.api.v2 import octavia
from oslo_config import cfg
from oslo_log import log
from oslo_utils import timeutils
import six

from aodh import evaluator
from aodh.evaluator import threshold
from aodh import keystone_client as aodh_keystone

LOG = log.getLogger(__name__)

ALARM_TYPE = "loadbalancer_member_health"

OPTS = [
    cfg.IntOpt('member_creation_time',
               default=120,
               help='The time in seconds to wait for the load balancer '
                    'member creation.'
               ),
]


class LoadBalancerMemberHealthEvaluator(evaluator.Evaluator):
    def __init__(self, conf):
        super(LoadBalancerMemberHealthEvaluator, self).__init__(conf)
        self._lb_client = None

    @property
    def lb_client(self):
        if self._lb_client is None:
            endpoint = aodh_keystone.url_for(
                self.conf,
                service_type='load-balancer',
                interface="internal",
                region_name=self.conf.service_credentials.region_name
            )
            self._lb_client = octavia.OctaviaAPI(
                session=aodh_keystone.get_session(self.conf),
                service_type='load-balancer',
                endpoint=endpoint
            )

        return self._lb_client

    def _get_unhealthy_members(self, pool_id):
        """Get the unhealthy members in a pool.

        The operating_status of a member (the virtual machine behind it)
        stays ERROR for a while after creation, until the application
        inside is up and running, so recently created members should be
        ignored during the check.
        """
        unhealthy_members = []

        try:
            ret = self.lb_client.member_list(pool_id)
        except Exception as e:
            LOG.warning("Failed to communicate with load balancing service, "
                        "error: %s", six.text_type(e))
            raise threshold.InsufficientDataError(
                'failed to communicate with load balancing service',
                []
            )

        if getattr(ret, 'status_code', None):
            # Some error happened
            raise threshold.InsufficientDataError(ret.content, [])

        for m in ret.get("members", []):
            try:
                created_time = parser.parse(m['created_at'], ignoretz=True)
            except ValueError:
                LOG.warning('Failed to parse the member created time.')
                continue

            now = timeutils.utcnow()
            t = self.conf.member_creation_time
            if now - created_time < datetime.timedelta(seconds=t):
                LOG.debug("Ignore member which was created within %ss", t)
                continue

            if m["admin_state_up"] and m["operating_status"] == "ERROR":
                unhealthy_members.append(m)

        return unhealthy_members

    def _transition_alarm(self, alarm, new_state, members,
                          count, unknown_reason, pool_id=None,
                          stack_id=None, asg_id=None):
        transition = alarm.state != new_state
        last = members[-1] if members else None

        reason_data = {
            'type': ALARM_TYPE,
            'count': count,
            'most_recent': last,
            'unhealthy_members': members,
            "pool_id": pool_id,
            "stack_id": stack_id,
            "asg_id": asg_id
        }

        if transition:
            reason = ('Transition to %(state)s due to %(count)d members'
                      ' unhealthy, most recent: %(most_recent)s' %
                      dict(state=new_state, count=count, most_recent=last))
        else:
            reason = ('Remaining as %(state)s' % dict(state=new_state))

        reason = unknown_reason or reason

        # Refresh and trigger alarm based on state transition.
        self._refresh(alarm, new_state, reason, reason_data)

    def evaluate(self, alarm):
        if not self.within_time_constraint(alarm):
            LOG.debug('Attempted to evaluate alarm %s, but it is not '
                      'within its time constraint.', alarm.alarm_id)
            return

        LOG.debug("Evaluating %s rule alarm %s ...", ALARM_TYPE,
                  alarm.alarm_id)

        pool_id = alarm.rule["pool_id"]
        error_mems = []
        try:
            error_mems = self._get_unhealthy_members(pool_id)
        except threshold.InsufficientDataError as e:
            evaluation = (evaluator.UNKNOWN, e.statistics, 0, e.reason)
        else:
            state = evaluator.ALARM if len(error_mems) > 0 else evaluator.OK
            evaluation = (state, error_mems, len(error_mems), None)

        self._transition_alarm(alarm, *evaluation, pool_id=pool_id,
                               stack_id=alarm.rule.get("stack_id"),
                               asg_id=alarm.rule.get("autoscaling_group_id"))
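The rule fields the evaluator reads (pool_id, plus the optional stack_id and autoscaling_group_id that are only passed through into reason_data) suggest what an alarm of this type looks like on the wire. A minimal sketch of creating one against the Aodh v2 API follows; the endpoint URL, the token, the alarm action, and the 'loadbalancer_member_health_rule' key name (inferred from Aodh's usual '<type>_rule' convention) are illustrative assumptions, not taken from this patch:

    import json
    import requests

    # Hypothetical alarm body; the "<type>_rule" key name is assumed.
    alarm = {
        "name": "lb_member_alarm",
        "type": "loadbalancer_member_health",
        "loadbalancer_member_health_rule": {
            "pool_id": "POOL_UUID",              # pool whose members are checked
            "stack_id": "HEAT_STACK_UUID",       # echoed back in reason_data
            "autoscaling_group_id": "ASG_UUID",  # echoed back in reason_data
        },
        "alarm_actions": ["http://handler.example.com/autoheal"],  # example webhook
    }

    resp = requests.post("http://aodh-api:8042/v2/alarms",  # assumed endpoint
                         headers={"X-Auth-Token": "TOKEN",
                                  "Content-Type": "application/json"},
                         data=json.dumps(alarm))
    print(resp.status_code)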
aodh/keystone_client.py

@@ -88,6 +88,11 @@ def delete_trust_id(trust_id, auth_plugin):
         pass


+def url_for(conf, **kwargs):
+    sess = get_session(conf)
+    return sess.get_endpoint(**kwargs)
+
+
 OPTS = [
     cfg.StrOpt('region-name',
                default=os.environ.get('OS_REGION_NAME'),
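The new url_for helper is a thin wrapper over keystoneauth1's Session.get_endpoint, which resolves a service endpoint from the Keystone catalog; the lb_client property above is its first caller. A usage sketch, where 'RegionOne' is an example value rather than anything from this patch:

    # conf: the loaded aodh configuration object.
    # Resolve the Octavia API endpoint the way lb_client does.
    endpoint = url_for(conf,
                       service_type='load-balancer',
                       interface='internal',
                       region_name='RegionOne')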
aodh/opts.py

@@ -22,6 +22,8 @@ import aodh.coordination
 import aodh.evaluator
 import aodh.evaluator.event
 import aodh.evaluator.gnocchi
+import aodh.evaluator.loadbalancer
 import aodh.evaluator.threshold
 import aodh.event
 import aodh.keystone_client
 import aodh.notifier.rest
@@ -37,6 +39,7 @@ def list_opts():
              aodh.evaluator.OPTS,
              aodh.evaluator.event.OPTS,
              aodh.evaluator.threshold.OPTS,
+             aodh.evaluator.loadbalancer.OPTS,
              aodh.notifier.rest.OPTS,
              aodh.queue.OPTS,
              aodh.service.OPTS)),
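Registering OPTS in list_opts makes member_creation_time visible to the oslo config generator; since the option is registered without a group it lands in [DEFAULT]. A self-contained sketch of the oslo.config behavior (standalone, outside the aodh service, for illustration only):

    from oslo_config import cfg

    conf = cfg.ConfigOpts()
    conf.register_opts([cfg.IntOpt('member_creation_time', default=120)])
    conf([])  # parse no CLI args; aodh normally loads aodh.conf here

    # Prints 120 unless overridden in aodh.conf, e.g.
    #   [DEFAULT]
    #   member_creation_time = 300
    print(conf.member_creation_time)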
aodh/tests/unit/evaluator/test_loadbalancer.py (new file, 158 lines)

@@ -0,0 +1,158 @@
# Copyright 2019 Catalyst Cloud Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime

import mock
from oslo_utils import timeutils
from oslo_utils import uuidutils

from aodh import evaluator
from aodh.evaluator import loadbalancer
from aodh.storage import models
from aodh.tests import constants
from aodh.tests.unit.evaluator import base


@mock.patch('octaviaclient.api.v2.octavia.OctaviaAPI')
@mock.patch('aodh.keystone_client.get_session')
class TestLoadBalancerMemberHealthEvaluator(base.TestEvaluatorBase):
    EVALUATOR = loadbalancer.LoadBalancerMemberHealthEvaluator

    def test_evaluate(self, mock_session, mock_octavia):
        alarm = models.Alarm(
            name='lb_member_alarm',
            description='lb_member_alarm',
            type=loadbalancer.ALARM_TYPE,
            enabled=True,
            user_id=uuidutils.generate_uuid(),
            project_id=uuidutils.generate_uuid(dashed=False),
            alarm_id=uuidutils.generate_uuid(),
            state='insufficient data',
            state_reason='insufficient data',
            state_timestamp=constants.MIN_DATETIME,
            timestamp=constants.MIN_DATETIME,
            insufficient_data_actions=[],
            ok_actions=[],
            alarm_actions=[],
            repeat_actions=False,
            time_constraints=[],
            severity='low',
            rule=dict(
                pool_id=uuidutils.generate_uuid(),
                stack_id=uuidutils.generate_uuid(),
                autoscaling_group_id=uuidutils.generate_uuid(),
            )
        )

        mock_client = mock.MagicMock()
        mock_octavia.return_value = mock_client
        created_at = timeutils.utcnow() - datetime.timedelta(days=1)
        mock_client.member_list.return_value = {
            'members': [
                {
                    'created_at': created_at.isoformat(),
                    'admin_state_up': True,
                    'operating_status': 'ERROR',
                }
            ]
        }

        self.evaluator.evaluate(alarm)

        self.assertEqual(evaluator.ALARM, alarm.state)

    def test_evaluate_octavia_error(self, mock_session, mock_octavia):
        class Response(object):
            def __init__(self, status_code, content):
                self.status_code = status_code
                self.content = content

        alarm = models.Alarm(
            name='lb_member_alarm',
            description='lb_member_alarm',
            type=loadbalancer.ALARM_TYPE,
            enabled=True,
            user_id=uuidutils.generate_uuid(),
            project_id=uuidutils.generate_uuid(dashed=False),
            alarm_id=uuidutils.generate_uuid(),
            state='insufficient data',
            state_reason='insufficient data',
            state_timestamp=constants.MIN_DATETIME,
            timestamp=constants.MIN_DATETIME,
            insufficient_data_actions=[],
            ok_actions=[],
            alarm_actions=[],
            repeat_actions=False,
            time_constraints=[],
            severity='low',
            rule=dict(
                pool_id=uuidutils.generate_uuid(),
                stack_id=uuidutils.generate_uuid(),
                autoscaling_group_id=uuidutils.generate_uuid(),
            )
        )

        mock_client = mock.MagicMock()
        mock_octavia.return_value = mock_client
        msg = 'Pool NotFound'
        mock_client.member_list.return_value = Response(404, msg)

        self.evaluator.evaluate(alarm)

        self.assertEqual(evaluator.UNKNOWN, alarm.state)
        self.assertEqual(msg, alarm.state_reason)

    def test_evaluate_alarm_to_ok(self, mock_session, mock_octavia):
        alarm = models.Alarm(
            name='lb_member_alarm',
            description='lb_member_alarm',
            type=loadbalancer.ALARM_TYPE,
            enabled=True,
            user_id=uuidutils.generate_uuid(),
            project_id=uuidutils.generate_uuid(dashed=False),
            alarm_id=uuidutils.generate_uuid(),
            state=evaluator.ALARM,
            state_reason='alarm',
            state_timestamp=constants.MIN_DATETIME,
            timestamp=constants.MIN_DATETIME,
            insufficient_data_actions=[],
            ok_actions=[],
            alarm_actions=[],
            repeat_actions=False,
            time_constraints=[],
            severity='low',
            rule=dict(
                pool_id=uuidutils.generate_uuid(),
                stack_id=uuidutils.generate_uuid(),
                autoscaling_group_id=uuidutils.generate_uuid(),
            )
        )

        mock_client = mock.MagicMock()
        mock_octavia.return_value = mock_client
        created_at = timeutils.utcnow() - datetime.timedelta(days=1)
        mock_client.member_list.return_value = {
            'members': [
                {
                    'created_at': created_at.isoformat(),
                    'admin_state_up': True,
                    'operating_status': 'ACTIVE',
                }
            ]
        }

        self.evaluator.evaluate(alarm)

        self.assertEqual(evaluator.OK, alarm.state)
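These tests should run like any other aodh unit tests; with the usual OpenStack tox/stestr wiring (the exact environment name depends on the local tox.ini), something like:

    tox -e py3 -- aodh.tests.unit.evaluator.test_loadbalancer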
Release note (new file, 4 lines)

@@ -0,0 +1,4 @@
features:
  - Added a new evaluator for the alarms of type
    ``loadbalancer_member_health`` which evaluates the alarm by checking the
    operating status of the members in a given load balancer pool.
requirements.txt

@@ -35,3 +35,5 @@ cachetools>=1.1.6
 cotyledon
 keystoneauth1>=2.1
 debtcollector>=1.2.0 # Apache-2.0
+python-octaviaclient>=1.8.0
+python-dateutil # BSD
setup.cfg

@@ -79,6 +79,7 @@ aodh.evaluator =
     gnocchi_aggregation_by_metrics_threshold = aodh.evaluator.gnocchi:GnocchiAggregationMetricsThresholdEvaluator
     gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator
     composite = aodh.evaluator.composite:CompositeEvaluator
+    loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator

 aodh.notifier =
     log = aodh.notifier.log:LogAlarmNotifier
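The setup.cfg entry point is what makes the new alarm type discoverable at runtime: aodh's evaluator service maps an alarm's type to an evaluator class through the aodh.evaluator namespace. Roughly, and only as a sketch (aodh drives this through stevedore's extension machinery; this is not the service code itself):

    from stevedore import driver

    # conf and alarm come from the running service in practice.
    # Resolve 'loadbalancer_member_health' to LoadBalancerMemberHealthEvaluator
    # via the aodh.evaluator entry point and instantiate it with the config.
    mgr = driver.DriverManager(namespace='aodh.evaluator',
                               name='loadbalancer_member_health',
                               invoke_on_load=True,
                               invoke_args=(conf,))
    mgr.driver.evaluate(alarm)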