Support Heat auto-healing notifier

The auto-healing notifier works together with loadbalancer_member_health
evaluator.

Presumably, the end user defines a Heat template which contains an
autoscaling group and all the members in the group are joined in an
Octavia load balancer in order to expose service to the outside, so that
when the stack scales up or scales down, Heat makes sure the new members
are joining the load balancer automatically and the old members are
removed.

However, this notifier handles the case where a member fails: the stack
can be recovered by marking the failed autoscaling group member as
unhealthy and then updating the Heat stack in place.

Change-Id: I6e92d1fc2125e155bb5068ff2c14fa318b126442
This commit is contained in:
Lingxian Kong 2019-05-14 22:05:04 +12:00
parent 8742b0d540
commit a8285b564d
8 changed files with 238 additions and 0 deletions

View File

@ -15,6 +15,7 @@
import os import os
from heatclient import client as heatclient
from keystoneauth1 import exceptions as ka_exception from keystoneauth1 import exceptions as ka_exception
from keystoneauth1.identity.generic import password from keystoneauth1.identity.generic import password
from keystoneauth1 import loading as ka_loading from keystoneauth1 import loading as ka_loading
@ -93,6 +94,19 @@ def url_for(conf, **kwargs):
return sess.get_endpoint(**kwargs) return sess.get_endpoint(**kwargs)
def get_heat_client_from_trust(conf, trust_id):
    """Build a Heat (orchestration) client scoped by a Keystone trust.

    The session of the trusted Keystone client is reused both to look up
    the internal orchestration endpoint in the configured region and to
    authenticate the returned Heat client.

    :param conf: service configuration; ``service_credentials.region_name``
                 is read to select the endpoint.
    :param trust_id: ID of the trust passed to ``get_trusted_client``.
    :returns: a version "1" ``heatclient`` client bound to that endpoint.
    """
    session = get_trusted_client(conf, trust_id).session
    region = conf.service_credentials.region_name

    heat_endpoint = session.get_endpoint(
        service_type='orchestration',
        interface="internal",
        region_name=region,
    )

    return heatclient.Client("1", endpoint=heat_endpoint, session=session)
OPTS = [ OPTS = [
cfg.StrOpt('region-name', cfg.StrOpt('region-name',
default=os.environ.get('OS_REGION_NAME'), default=os.environ.get('OS_REGION_NAME'),

114
aodh/notifier/heat.py Normal file
View File

@ -0,0 +1,114 @@
# Copyright 2019 Catalyst Cloud Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_log import log
from oslo_utils import uuidutils
import six
from aodh import keystone_client as aodh_keystone
from aodh import notifier
LOG = log.getLogger(__name__)
class TrustHeatAlarmNotifier(notifier.AlarmNotifier):
    """Heat auto-healing notifier.

    The auto-healing notifier works together with the
    loadbalancer_member_health evaluator.

    Presumably, the end user defines a Heat template which contains an
    autoscaling group and all the members in the group are joined in an
    Octavia load balancer in order to expose service to the outside, so that
    when the stack scales up or scales down, Heat makes sure the new members
    are joining the load balancer automatically and the old members are
    removed.

    However, this notifier handles the case where a member fails: the stack
    can be recovered by marking the failed autoscaling group member as
    unhealthy and then updating the Heat stack in place. In order to do
    that, the notifier needs to know:

    - Heat stack ID.
    - Heat autoscaling group ID.
    - The failed Octavia pool members.

    The resource ID in the autoscaling group is saved in the Octavia member
    tags. So, only Octavia stable/stein or later versions are supported.
    """

    def __init__(self, conf):
        super(TrustHeatAlarmNotifier, self).__init__(conf)
        self.conf = conf

    def notify(self, action, alarm_id, alarm_name, severity, previous, current,
               reason, reason_data):
        """Mark failed group members unhealthy and update the Heat stack.

        :param action: parsed action URL; its ``username`` component is used
                       as the trust ID for talking to Heat.
        :param alarm_id: ID of the alarm, used in log messages.
        :param alarm_name: name of the alarm, used in log messages.
        :param severity: alarm severity, used in log messages.
        :param previous: previous alarm state, used in log messages.
        :param current: current alarm state, used in log messages.
        :param reason: human-readable transition reason, used in log
                       messages.
        :param reason_data: dict that must contain ``stack_id`` and
                            ``asg_id`` and may contain
                            ``unhealthy_members``, a list of dicts whose
                            ``tags`` lists are scanned for UUID-like values.

        Nothing is sent to Heat when ``stack_id`` or ``asg_id`` is missing
        or when no UUID-like tag is found. Any exception raised while
        talking to Heat is logged and not propagated; in that case the
        remaining resources are not marked and the stack is not updated.
        """
        # Pass the mapping as a logging argument so interpolation only
        # happens when the record is actually emitted.
        LOG.info(
            "Notifying alarm %(alarm_name)s %(alarm_id)s of %(severity)s "
            "priority from %(previous)s to %(current)s with action %(action)s"
            " because %(reason)s.",
            {'alarm_name': alarm_name,
             'alarm_id': alarm_id,
             'severity': severity,
             'previous': previous,
             'current': current,
             'action': action.geturl(),
             'reason': reason}
        )

        trust_id = action.username
        stack_id = reason_data.get("stack_id")
        asg_id = reason_data.get("asg_id")

        if not stack_id or not asg_id:
            LOG.warning(
                "stack_id and asg_id must exist to notify alarm %s", alarm_id
            )
            return

        # UUID-like member tags are taken to be the IDs of the autoscaling
        # group resources that back the unhealthy members.
        resources = []
        for member in reason_data.get("unhealthy_members", []):
            for tag in member.get("tags", []):
                if uuidutils.is_uuid_like(tag):
                    resources.append(tag)

        if not resources:
            return

        try:
            heat_client = aodh_keystone.get_heat_client_from_trust(
                self.conf, trust_id
            )

            for res in resources:
                heat_client.resources.mark_unhealthy(
                    asg_id,
                    res,
                    True,
                    "unhealthy load balancer member"
                )
                LOG.info(
                    "Heat resource %(resource_id)s is marked as unhealthy "
                    "for alarm %(alarm_id)s",
                    {"resource_id": res, "alarm_id": alarm_id}
                )

            # Update the stack in place, reusing its existing template and
            # parameters.
            heat_client.stacks.update(stack_id, existing=True)
            LOG.info(
                "Heat stack %(stack_id)s is updated for alarm "
                "%(alarm_id)s",
                {"stack_id": stack_id, "alarm_id": alarm_id}
            )
        except Exception as e:
            LOG.exception("Failed to communicate with Heat service, "
                          "error: %s", six.text_type(e))

View File

View File

@ -0,0 +1,27 @@
# Copyright 2019 Catalyst Cloud Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_config import fixture
from oslotest import base
from aodh import service
class TestNotifierBase(base.BaseTestCase):
    """Base test case exposing a prepared service config as ``self.conf``."""

    def setUp(self):
        super(TestNotifierBase, self).setUp()
        # Prepare the service config without CLI arguments or config files,
        # then wrap it in the oslo.config fixture.
        service_conf = service.prepare_service(argv=[], config_files=[])
        config_fixture = self.useFixture(fixture.Config(service_conf))
        self.conf = config_fixture.conf

View File

@ -0,0 +1,78 @@
# Copyright 2019 Catalyst Cloud Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import mock
from oslo_utils import netutils
from aodh.notifier import heat as heat_notifier
from aodh.tests.unit.notifier import base
class TestTrustHeatAlarmNotifier(base.TestNotifierBase):
    """Unit tests for the Heat auto-healing alarm notifier."""

    # UUID-like tag on the fake unhealthy member; the notifier treats such
    # tags as Heat resource IDs.
    RESOURCE_ID = "3bd8bc5a-7632-11e9-84cd-00224d6b7bc1"

    def _notify(self, reason_data):
        """Run the notifier once with fixed fake alarm attributes."""
        # Use the scheme the notifier is registered under (``trust+heat``);
        # the URL's username part carries the trust ID.
        action = netutils.urlsplit("trust+heat://fake_trust_id:delete@")
        notifier = heat_notifier.TrustHeatAlarmNotifier(self.conf)
        notifier.notify(action, "fake_alarm_id", "fake_alarm_name", "low",
                        "ok", "alarm", "no good reason", reason_data)

    @mock.patch("aodh.keystone_client.get_heat_client_from_trust")
    def test_notify(self, mock_heatclient):
        reason_data = {
            "stack_id": "fake_stack_id",
            "asg_id": "fake_asg_id",
            "unhealthy_members": [
                {"tags": [self.RESOURCE_ID]}
            ]
        }

        self._notify(reason_data)

        mock_heatclient.assert_called_once_with(self.conf, "fake_trust_id")

        mock_client = mock_heatclient.return_value
        mock_client.resources.mark_unhealthy.assert_called_once_with(
            "fake_asg_id",
            self.RESOURCE_ID,
            True,
            "unhealthy load balancer member"
        )
        mock_client.stacks.update.assert_called_once_with(
            "fake_stack_id", existing=True
        )

    @mock.patch("aodh.keystone_client.get_heat_client_from_trust")
    def test_notify_stack_id_missing(self, mock_heatclient):
        # Without a stack_id the notifier must not contact Heat at all.
        reason_data = {
            "asg_id": "fake_asg_id",
            "unhealthy_members": [
                {"tags": [self.RESOURCE_ID]}
            ]
        }

        self._notify(reason_data)

        self.assertFalse(mock_heatclient.called)

View File

@ -0,0 +1,3 @@
features:
- Added a new notifier (``trust+heat``) that works together with the
``loadbalancer_member_health`` evaluator for auto-healing purposes.

View File

@ -37,3 +37,4 @@ keystoneauth1>=2.1
debtcollector>=1.2.0 # Apache-2.0 debtcollector>=1.2.0 # Apache-2.0
python-octaviaclient>=1.8.0 python-octaviaclient>=1.8.0
python-dateutil # BSD python-dateutil # BSD
python-heatclient>=1.17.0

View File

@ -91,6 +91,7 @@ aodh.notifier =
trust+https = aodh.notifier.trust:TrustRestAlarmNotifier trust+https = aodh.notifier.trust:TrustRestAlarmNotifier
zaqar = aodh.notifier.zaqar:ZaqarAlarmNotifier zaqar = aodh.notifier.zaqar:ZaqarAlarmNotifier
trust+zaqar = aodh.notifier.zaqar:TrustZaqarAlarmNotifier trust+zaqar = aodh.notifier.zaqar:TrustZaqarAlarmNotifier
trust+heat = aodh.notifier.heat:TrustHeatAlarmNotifier
wsgi_scripts = wsgi_scripts =
aodh-api = aodh.api.app:build_wsgi_app aodh-api = aodh.api.app:build_wsgi_app