diff --git a/doc/source/housekeeper.rst b/doc/source/housekeeper.rst index bcdb9486cc..8576b1fbfd 100644 --- a/doc/source/housekeeper.rst +++ b/doc/source/housekeeper.rst @@ -60,3 +60,6 @@ error_dhcp_edge: scans for DHCP Edge appliances which are in ERROR state. When in non-readonly mode, the job will attempt recovery of the DHCP edges by removing stale elements from the Neutron DB and reconfigure the interfaces at the backend when required. + +error_backup_edge: scans for backup Edge appliances which are in ERROR state. +When in non-readonly mode, the job will reset the Edge appliance configuration. diff --git a/setup.cfg b/setup.cfg index 273279206f..ebc909569a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -76,6 +76,7 @@ openstack.nsxclient.v2 = project_plugin_list = vmware_nsx.osc.v2.project_plugin_map:ListProjectPluginMap vmware_nsx.neutron.nsxv.housekeeper.jobs = error_dhcp_edge = vmware_nsx.plugins.nsx_v.housekeeper.error_dhcp_edge:ErrorDhcpEdgeJob + error_backup_edge = vmware_nsx.plugins.nsx_v.housekeeper.error_backup_edge:ErrorBackupEdgeJob [build_sphinx] source-dir = doc/source diff --git a/vmware_nsx/common/config.py b/vmware_nsx/common/config.py index 202a41ffc5..da31be99d7 100644 --- a/vmware_nsx/common/config.py +++ b/vmware_nsx/common/config.py @@ -707,7 +707,7 @@ nsxv_opts = [ help=_("If False, different tenants will not use the same " "DHCP edge or router edge.")), cfg.ListOpt('housekeeping_jobs', - default=['error_dhcp_edge'], + default=['error_dhcp_edge', 'error_backup_edge'], help=_("List of the enabled housekeeping jobs")), cfg.BoolOpt('housekeeping_readonly', default=True, diff --git a/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py b/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py new file mode 100644 index 0000000000..df780e7bc9 --- /dev/null +++ b/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py @@ -0,0 +1,105 @@ +# Copyright 2017 VMware, Inc. 
class ErrorBackupEdgeJob(base_job.BaseJob):
    """Housekeeping job for backup Edge appliances whose status is ERROR.

    Each run logs a warning for every backup edge binding found in ERROR
    status. When the job is not read-only it additionally re-applies the
    edge configuration and, if that does not succeed, deletes the edge.
    """

    def __init__(self, readonly):
        super(ErrorBackupEdgeJob, self).__init__(readonly)
        # Queried with each binding's 'availability_zone' value when an edge
        # is being recovered.
        self.azs = nsx_az.NsxVAvailabilityZones()

    def get_name(self):
        """Return the name this job is registered under."""
        return 'error_backup_edge'

    def get_description(self):
        """Return a short, human readable summary of the job."""
        return 'revalidate backup Edge appliances in ERROR state'

    def run(self, context):
        """Report backup edges in ERROR and, unless read-only, handle them.

        :param context: request context; its ``session`` is used for the
            router binding queries and updates.
        """
        super(ErrorBackupEdgeJob, self).run(context)

        status_filter = {'status': [constants.ERROR]}
        backup_filter = {
            'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"}
        # Only the query itself runs under the backup pool lock; the
        # returned bindings are then processed from this snapshot.
        with locking.LockManager.get_lock('nsx-edge-backup-pool'):
            broken_bindings = nsxv_db.get_nsxv_router_bindings(
                context.session,
                filters=status_filter, like_filters=backup_filter)

        if not broken_bindings:
            LOG.debug('Housekeeping: no backup edges in ERROR state detected')
            return

        for broken in broken_bindings:
            LOG.warning('Housekeeping: Backup Edge appliance %s is in '
                        'ERROR state', broken['edge_id'])

            if self.readonly:
                # Read-only mode stops at reporting.
                continue

            # Each edge is handled while holding a lock named after it.
            with locking.LockManager.get_lock(broken['edge_id']):
                self._handle_backup_edge(context, broken)

    def _handle_backup_edge(self, context, binding):
        """Re-apply the edge configuration; delete the edge on failure.

        :param context: request context whose ``session`` is used for the
            binding status update.
        :param binding: router binding of the backup edge in ERROR.
        """
        dist = (binding['edge_type'] == nsxv_constants.VDR_EDGE)
        az = self.azs.get_availability_zone(binding['availability_zone'])

        try:
            recovered = self.plugin.nsx_v.update_edge(
                context,
                binding['router_id'],
                binding['edge_id'],
                binding['router_id'],
                None,
                appliance_size=binding['appliance_size'],
                dist=dist,
                availability_zone=az)

            # The status update stays inside the try block on purpose: a
            # failure here is treated like a failed edge update.
            if recovered:
                nsxv_db.update_nsxv_router_binding(
                    context.session, binding['router_id'],
                    status=constants.ACTIVE)
        except Exception as e:
            LOG.error('Housekeeping: failed to recover Edge appliance %s '
                      'with exception %s', binding['edge_id'], e)
            recovered = False

        if recovered:
            return

        LOG.warning('Housekeeping: failed to recover Edge appliance %s, '
                    'trying to delete', binding['edge_id'])
        self._delete_edge(context, binding, dist)

    def _delete_edge(self, context, binding, dist):
        """Mark the binding PENDING_DELETE and delete the edge.

        A missing binding row is logged at debug level and the edge delete
        is still attempted. A failing edge delete is logged as a warning and
        not re-raised.

        :param context: request context whose ``session`` is used for the
            binding status update.
        :param binding: router binding of the edge to delete.
        :param dist: True when the binding's edge type is the VDR edge type.
        """
        router_id = binding['router_id']

        try:
            nsxv_db.update_nsxv_router_binding(
                context.session, router_id,
                status=constants.PENDING_DELETE)
        except sa_exc.NoResultFound:
            LOG.debug("Housekeeping: Router binding %s does not exist.",
                      router_id)

        try:
            self.plugin.nsx_v.delete_edge(
                context, router_id, binding['edge_id'], dist=dist)
        except Exception as e:
            LOG.warning('Housekeeping: Failed to delete edge %s '
                        'with exception %s', binding['edge_id'], e)
hk_ns='vmware_nsx.neutron.nsxv.housekeeper.jobs', - hk_jobs=['error_dhcp_edge']) + hk_jobs=cfg.CONF.nsxv.housekeeping_jobs) self.init_is_complete = True diff --git a/vmware_nsx/plugins/nsx_v/vshield/edge_utils.py b/vmware_nsx/plugins/nsx_v/vshield/edge_utils.py index 5312a381ac..18f68ea96a 100644 --- a/vmware_nsx/plugins/nsx_v/vshield/edge_utils.py +++ b/vmware_nsx/plugins/nsx_v/vshield/edge_utils.py @@ -319,11 +319,12 @@ class EdgeManager(object): def _clean_all_error_edge_bindings(self, context, availability_zone): # Find all backup edges in error state & # backup edges which are in pending-XXX state for too long - filters = {'status': [constants.ERROR, - constants.PENDING_CREATE, + filters = {'status': [constants.PENDING_CREATE, constants.PENDING_UPDATE, constants.PENDING_DELETE], 'availability_zone': [availability_zone.name]} + if cfg.CONF.nsxv.housekeeping_readonly: + filters['status'].append(constants.ERROR) like_filters = {'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"} router_bindings = nsxv_db.get_nsxv_router_bindings( context.session, filters=filters, like_filters=like_filters) diff --git a/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py new file mode 100644 index 0000000000..beba17ee65 --- /dev/null +++ b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py @@ -0,0 +1,82 @@ +# Copyright 2017 VMware, Inc. +# All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class ErrorBackupEdgeTestCaseReadOnly(base.BaseTestCase):
    """Exercise ErrorBackupEdgeJob in read-only mode.

    Subclasses override ``_is_readonly`` to run the same fixture against a
    job created in a different mode.
    """

    def _is_readonly(self):
        """Return the ``readonly`` flag the job under test is built with."""
        return True

    def setUp(self):
        def get_plugin_mock(alias=constants.CORE):
            # Serve the same mocked plugin for the core and L3 aliases;
            # any other alias falls through and yields None.
            if alias in (constants.CORE, constants.L3):
                return self.plugin

        super(ErrorBackupEdgeTestCaseReadOnly, self).setUp()
        self.plugin = mock.Mock()
        self.context = mock.Mock()
        self.context.session = mock.Mock()
        mock.patch('neutron_lib.plugins.directory.get_plugin',
                   side_effect=get_plugin_mock).start()

        # Swap the module logger through a patcher rather than assigning
        # error_backup_edge.LOG directly: a plain assignment is never undone
        # and would leave the mock logger installed for every later test.
        self.log = mock.Mock()
        log_patcher = mock.patch.object(error_backup_edge, 'LOG', self.log)
        log_patcher.start()
        self.addCleanup(log_patcher.stop)

        self.job = error_backup_edge.ErrorBackupEdgeJob(self._is_readonly())

    def test_clean_run(self):
        """No bindings in ERROR: the job must not log any warning."""
        mock.patch('vmware_nsx.db.nsxv_db.get_nsxv_router_bindings',
                   return_value=[]).start()
        self.job.run(self.context)
        self.log.warning.assert_not_called()

    def test_broken_backup_edge(self):
        """One binding in ERROR: exactly one warning is logged."""
        mock.patch('vmware_nsx.db.nsxv_db.get_nsxv_router_bindings',
                   return_value=FAKE_ROUTER_BINDINGS).start()

        self.job.run(self.context)
        self.log.warning.assert_called_once()


class ErrorBackupEdgeTestCaseReadWrite(ErrorBackupEdgeTestCaseReadOnly):
    """Exercise ErrorBackupEdgeJob with read-only mode switched off."""

    def _is_readonly(self):
        """Return the ``readonly`` flag the job under test is built with."""
        return False

    def test_broken_backup_edge(self):
        """One binding in ERROR: the edge is updated and set to ACTIVE."""
        upd_binding = mock.patch(
            'vmware_nsx.db.nsxv_db.update_nsxv_router_binding').start()
        upd_edge = mock.patch.object(self.plugin.nsx_v, 'update_edge').start()
        # Replace the availability zone lookup so the job receives a known
        # object that can be matched in the update_edge assertion below.
        self.job.azs = mock.Mock()
        az = mock.Mock()
        mock.patch.object(self.job.azs, 'get_availability_zone',
                          return_value=az).start()

        # The read-only checks (a single warning) must still hold.
        super(ErrorBackupEdgeTestCaseReadWrite,
              self).test_broken_backup_edge()

        upd_binding.assert_has_calls(
            [mock.call(mock.ANY, r['router_id'], status='ACTIVE')
             for r in FAKE_ROUTER_BINDINGS])
        upd_edge.assert_called_with(
            self.context, 'backup-3b0b1fe1-c984', 'edge-782',
            'backup-3b0b1fe1-c984', None, appliance_size='compact',
            availability_zone=az, dist=False)