From 7296491659a2e8a04c0bcec4331a423bdfb884f0 Mon Sep 17 00:00:00 2001 From: Kobi Samoray Date: Tue, 26 Jun 2018 14:53:17 +0300 Subject: [PATCH] Housekeeper: deliver output Add functionality to the housekeeper: - Add error count, fixed count and info fields to extension - Output the job results via email to a specified admin e-address Change-Id: Ifab4c1cb293e90d950f5e4b80a6f7cb93129e816 --- vmware_nsx/common/config.py | 12 +- vmware_nsx/extensions/housekeeper.py | 6 + .../plugins/common/housekeeper/base_job.py | 20 +++ .../plugins/common/housekeeper/housekeeper.py | 132 ++++++++++++++++-- .../nsx_v/housekeeper/error_backup_edge.py | 27 +++- .../nsx_v/housekeeper/error_dhcp_edge.py | 93 ++++++++---- .../nsx_v/housekeeper/lbaas_pending.py | 19 ++- .../housekeeper/test_error_backup_edge.py | 3 +- .../nsx_v/housekeeper/test_error_dhcp_edge.py | 3 +- 9 files changed, 270 insertions(+), 45 deletions(-) diff --git a/vmware_nsx/common/config.py b/vmware_nsx/common/config.py index 9a307ebf3f..d04f85a105 100644 --- a/vmware_nsx/common/config.py +++ b/vmware_nsx/common/config.py @@ -229,6 +229,7 @@ cluster_opts = [ "network connection")), ] + nsx_common_opts = [ cfg.StrOpt('nsx_l2gw_driver', help=_("Specify the class path for the Layer 2 gateway " @@ -251,8 +252,17 @@ nsx_common_opts = [ help=_("An ordered list of extension driver " "entrypoints to be loaded from the " "vmware_nsx.extension_drivers namespace.")), + cfg.StrOpt('smtp_gateway', + help=_("(Optional) IP address of SMTP gateway to use for" + "admin warnings.")), + cfg.StrOpt('smtp_from_addr', + help=_("(Optional) email address to use for outgoing admin" + "notifications.")), + cfg.ListOpt('snmp_to_list', + default=[], + help=_("(Optional) List of email addresses for " + "notifications.")), ] - nsx_v3_opts = [ cfg.ListOpt('nsx_api_user', default=['admin'], diff --git a/vmware_nsx/extensions/housekeeper.py b/vmware_nsx/extensions/housekeeper.py index 8916444960..fae2c63e22 100644 --- 
a/vmware_nsx/extensions/housekeeper.py +++ b/vmware_nsx/extensions/housekeeper.py @@ -34,6 +34,12 @@ RESOURCE_ATTRIBUTE_MAP = { 'allow_post': False, 'allow_put': False, 'is_visible': True}, 'enabled': { 'allow_post': False, 'allow_put': False, 'is_visible': True}, + 'error_count': { + 'allow_post': False, 'allow_put': False, 'is_visible': True}, + 'fixed_count': { + 'allow_post': False, 'allow_put': False, 'is_visible': True}, + 'error_info': { + 'allow_post': False, 'allow_put': False, 'is_visible': True}, } } diff --git a/vmware_nsx/plugins/common/housekeeper/base_job.py b/vmware_nsx/plugins/common/housekeeper/base_job.py index 5dc4cc1800..c06df71ee1 100644 --- a/vmware_nsx/plugins/common/housekeeper/base_job.py +++ b/vmware_nsx/plugins/common/housekeeper/base_job.py @@ -59,3 +59,23 @@ class BaseJob(object): @abc.abstractmethod def get_project_plugin(self, plugin): pass + + +def housekeeper_info(info, fmt, *args): + msg = fmt % args + if info: + info = "%s\n%s" % (info, msg) + else: + info = msg + LOG.info("Housekeeping: %s", msg) + return info + + +def housekeeper_warning(info, fmt, *args): + msg = fmt % args + if info: + info = "%s\n%s" % (info, msg) + else: + info = msg + LOG.warning("Housekeeping: %s", msg) + return info diff --git a/vmware_nsx/plugins/common/housekeeper/housekeeper.py b/vmware_nsx/plugins/common/housekeeper/housekeeper.py index 3037187acf..48044952fc 100644 --- a/vmware_nsx/plugins/common/housekeeper/housekeeper.py +++ b/vmware_nsx/plugins/common/housekeeper/housekeeper.py @@ -13,6 +13,11 @@ # License for the specific language governing permissions and limitations # under the License. 
+import smtplib + +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText + from oslo_config import cfg from oslo_log import log import stevedore @@ -24,12 +29,23 @@ LOG = log.getLogger(__name__) ALL_DUMMY_JOB = { 'name': 'all', 'description': 'Execute all housekeepers', - 'enabled': True} + 'enabled': True, + 'error_count': 0, + 'fixed_count': 0, + 'error_info': None} class NsxvHousekeeper(stevedore.named.NamedExtensionManager): def __init__(self, hk_ns, hk_jobs): + self.email_notifier = None + if (cfg.CONF.smtp_gateway and + cfg.CONF.smtp_from_addr and + cfg.CONF.snmp_to_list): + self.email_notifier = HousekeeperEmailNotifier() + self.readonly = cfg.CONF.nsxv.housekeeping_readonly + self.results = {} + if self.readonly: LOG.info('Housekeeper initialized in readonly mode') else: @@ -45,40 +61,140 @@ class NsxvHousekeeper(stevedore.named.NamedExtensionManager): self.jobs[job.obj.get_name()] = job.obj def get(self, job_name): - if job_name == ALL_DUMMY_JOB.get('name'): - return ALL_DUMMY_JOB + if job_name == ALL_DUMMY_JOB['name']: + return {'name': job_name, + 'description': ALL_DUMMY_JOB['description'], + 'enabled': job_name in self.jobs, + 'error_count': self.results.get( + job_name, {}).get('error_count', 0), + 'fixed_count': self.results.get( + job_name, {}).get('fixed_count', 0), + 'error_info': self.results.get( + job_name, {}).get('error_info', '')} for job in self: name = job.obj.get_name() if job_name == name: return {'name': job_name, 'description': job.obj.get_description(), - 'enabled': job_name in self.jobs} + 'enabled': job_name in self.jobs, + 'error_count': self.results.get( + job_name, {}).get('error_count', 0), + 'fixed_count': self.results.get( + job_name, {}).get('fixed_count', 0), + 'error_info': self.results.get( + job_name, {}).get('error_info', '')} raise n_exc.ObjectNotFound(id=job_name) def list(self): - results = [ALL_DUMMY_JOB] + results = [{'name': ALL_DUMMY_JOB['name'], + 'description': 
ALL_DUMMY_JOB['description'], + 'enabled': ALL_DUMMY_JOB['name'] in self.jobs, + 'error_count': self.results.get( + ALL_DUMMY_JOB['name'], {}).get('error_count', 0), + 'fixed_count': self.results.get( + ALL_DUMMY_JOB['name'], {}).get('fixed_count', 0), + 'error_info': self.results.get( + ALL_DUMMY_JOB['name'], {}).get('error_info', '')}] for job in self: job_name = job.obj.get_name() results.append({'name': job_name, 'description': job.obj.get_description(), - 'enabled': job_name in self.jobs}) + 'enabled': job_name in self.jobs, + 'error_count': self.results.get( + job_name, {}).get('error_count', 0), + 'fixed_count': self.results.get( + job_name, {}).get('fixed_count', 0), + 'error_info': self.results.get( + job_name, {}).get('error_info', '')}) return results def run(self, context, job_name): + self.results = {} if context.is_admin: + if self.email_notifier: + self.email_notifier.start('Cloud Housekeeper Execution Report') + with locking.LockManager.get_lock('nsx-housekeeper'): + error_count = 0 + fixed_count = 0 + error_info = '' if job_name == ALL_DUMMY_JOB.get('name'): for job in self.jobs.values(): - job.run(context) + result = job.run(context) + if result: + if self.email_notifier and result['error_count']: + self._add_job_text_to_notifier(job, result) + error_count += result['error_count'] + fixed_count += result['fixed_count'] + error_info += result['error_info'] + "\n" + self.results[job_name] = { + 'error_count': error_count, + 'fixed_count': fixed_count, + 'error_info': error_info + } + else: job = self.jobs.get(job_name) if job: - job.run(context) + result = job.run(context) + if result: + error_count = result['error_count'] + if self.email_notifier: + self._add_job_text_to_notifier(job, result) + self.results[job.get_name()] = result else: raise n_exc.ObjectNotFound(id=job_name) + + if self.email_notifier and error_count: + self.email_notifier.send() else: raise n_exc.AdminRequired() + + def _add_job_text_to_notifier(self, job, result): + 
self.email_notifier.add_text("%s:", job.get_name()) + self.email_notifier.add_text( + '%d errors found, %d fixed\n%s\n\n', + result['error_count'], + result['fixed_count'], + result['error_info']) + + +class HousekeeperEmailNotifier(object): + def __init__(self): + self.msg = None + self.html = None + self.has_text = False + + def start(self, subject): + self.msg = MIMEMultipart('alternative') + self.msg['Subject'] = subject + self.msg['From'] = cfg.CONF.smtp_from_addr + self.msg['To'] = ', '.join(cfg.CONF.snmp_to_list) + self.html = '
' + self.has_text = False + + def add_text(self, fmt, *args): + self.has_text = True + text = fmt % args + LOG.debug("Housekeeper emailer adding text %s", text) + self.html += text.replace("\n", "
") + "
\n" + + def send(self): + if self.has_text: + self.html += "
" + part1 = MIMEText(self.html, 'html') + self.msg.attach(part1) + + s = smtplib.SMTP(cfg.CONF.smtp_gateway) + + s.sendmail(cfg.CONF.smtp_from_addr, + cfg.CONF.snmp_to_list, + self.msg.as_string()) + s.quit() + + self.msg = None + self.html = None diff --git a/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py b/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py index 1918942b1e..e0cd749433 100644 --- a/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py +++ b/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py @@ -44,6 +44,9 @@ class ErrorBackupEdgeJob(base_job.BaseJob): def run(self, context): super(ErrorBackupEdgeJob, self).run(context) + error_count = 0 + fixed_count = 0 + error_info = '' # Gather ERROR state backup edges into dict filters = {'status': [constants.ERROR]} @@ -54,20 +57,30 @@ class ErrorBackupEdgeJob(base_job.BaseJob): if not error_edge_bindings: LOG.debug('Housekeeping: no backup edges in ERROR state detected') - return + return {'error_count': 0, + 'fixed_count': 0, + 'error_info': 'No backup edges in ERROR state detected'} # Keep list of current broken backup edges - as it may change while # HK is running for binding in error_edge_bindings: - LOG.warning('Housekeeping: Backup Edge appliance %s is in ERROR' - ' state', binding['edge_id']) + error_count += 1 + error_info = base_job.housekeeper_warning( + error_info, 'Backup Edge appliance %s is in ERROR state', + binding['edge_id']) if not self.readonly: with locking.LockManager.get_lock(binding['edge_id']): - self._handle_backup_edge(context, binding) + if self._handle_backup_edge(context, binding): + fixed_count += 1 + + return {'error_count': error_count, + 'fixed_count': fixed_count, + 'error_info': error_info} def _handle_backup_edge(self, context, binding): dist = (binding['edge_type'] == nsxv_constants.VDR_EDGE) + result = True az = self.azs.get_availability_zone( binding['availability_zone']) try: @@ -90,7 +103,9 @@ class ErrorBackupEdgeJob(base_job.BaseJob): if 
not update_result: LOG.warning('Housekeeping: failed to recover Edge ' 'appliance %s, trying to delete', binding['edge_id']) - self._delete_edge(context, binding, dist) + result = self._delete_edge(context, binding, dist) + + return result def _delete_edge(self, context, binding, dist): try: @@ -104,6 +119,8 @@ class ErrorBackupEdgeJob(base_job.BaseJob): try: self.plugin.nsx_v.delete_edge(context, binding['router_id'], binding['edge_id'], dist=dist) + return True + except Exception as e: LOG.warning('Housekeeping: Failed to delete edge %s with ' 'exception %s', binding['edge_id'], e) diff --git a/vmware_nsx/plugins/nsx_v/housekeeper/error_dhcp_edge.py b/vmware_nsx/plugins/nsx_v/housekeeper/error_dhcp_edge.py index 1741675291..08821f7cfe 100644 --- a/vmware_nsx/plugins/nsx_v/housekeeper/error_dhcp_edge.py +++ b/vmware_nsx/plugins/nsx_v/housekeeper/error_dhcp_edge.py @@ -27,6 +27,12 @@ LOG = log.getLogger(__name__) class ErrorDhcpEdgeJob(base_job.BaseJob): + def __init__(self, readonly): + super(ErrorDhcpEdgeJob, self).__init__(readonly) + self.error_count = 0 + self.fixed_count = 0 + self.fixed_sub_if_count = 0 + self.error_info = '' def get_project_plugin(self, plugin): return plugin.get_plugin_by_type(projectpluginmap.NsxPlugins.NSX_V) @@ -39,6 +45,10 @@ class ErrorDhcpEdgeJob(base_job.BaseJob): def run(self, context): super(ErrorDhcpEdgeJob, self).run(context) + self.error_count = 0 + self.fixed_count = 0 + self.fixed_sub_if_count = 0 + self.error_info = '' # Gather ERROR state DHCP edges into dict filters = {'status': [constants.ERROR]} @@ -47,7 +57,9 @@ class ErrorDhcpEdgeJob(base_job.BaseJob): if not error_edge_bindings: LOG.debug('Housekeeping: no DHCP edges in ERROR state detected') - return + return {'error_count': self.error_count, + 'fixed_count': self.fixed_count, + 'error_info': 'No DHCP error state edges detected'} with locking.LockManager.get_lock('nsx-dhcp-edge-pool'): edge_dict = {} @@ -70,8 +82,14 @@ class ErrorDhcpEdgeJob(base_job.BaseJob): 
self._validate_dhcp_edge( context, edge_dict, pfx_dict, networks, edge_id) except Exception as e: - LOG.error('Failed to recover DHCP Edge %s (%s)', - edge_id, e) + self.error_count += 1 + self.error_info = base_job.housekeeper_warning( + self.error_info, + 'Failed to recover DHCP Edge %s (%s)', edge_id, e) + + return {'error_count': self.error_count, + 'fixed_count': self.fixed_count, + 'error_info': self.error_info} def _validate_dhcp_edge( self, context, edge_dict, pfx_dict, networks, edge_id): @@ -95,21 +113,29 @@ class ErrorDhcpEdgeJob(base_job.BaseJob): if net_id is None: # Delete router binding as we do not have such network # in Neutron - LOG.warning('Housekeeping: router binding %s for edge ' - '%s has no matching neutron network', - router_id, edge_id) + self.error_count += 1 + self.error_info = base_job.housekeeper_warning( + self.error_info, + 'router binding %s for edge %s has no matching ' + 'neutron network', router_id, edge_id) + if not self.readonly: nsxv_db.delete_nsxv_router_binding( context.session, binding['router_id']) + self.fixed_count += 1 else: if net_id not in edge_networks: # Create vNic bind here - LOG.warning('Housekeeping: edge %s vnic binding ' - 'missing for network %s', edge_id, - net_id) + self.error_count += 1 + self.error_info = base_job.housekeeper_warning( + self.error_info, + 'edge %s vnic binding missing for network %s', + edge_id, net_id) + if not self.readonly: nsxv_db.allocate_edge_vnic_with_tunnel_index( context.session, edge_id, net_id, az_name) + self.fixed_count += 1 # Step (B) # Find vNic bindings which reference invalid networks or aren't @@ -122,12 +148,16 @@ class ErrorDhcpEdgeJob(base_job.BaseJob): for bind in vnic_binds: if bind['network_id'] not in networks: - LOG.warning('Housekeeping: edge vnic binding for edge ' - '%s is for invalid network id %s', - edge_id, bind['network_id']) + self.error_count += 1 + self.error_info = base_job.housekeeper_warning( + self.error_info, + 'edge vnic binding for edge %s is 
for invalid ' + 'network id %s', edge_id, bind['network_id']) + if not self.readonly: nsxv_db.free_edge_vnic_by_network( context.session, edge_id, bind['network_id']) + self.fixed_count += 1 # Step (C) # Verify that backend is in sync with Neutron @@ -158,6 +188,8 @@ class ErrorDhcpEdgeJob(base_job.BaseJob): self._update_router_bindings(context, edge_id) + self.fixed_count += self.fixed_sub_if_count + def _validate_edge_subinterfaces(self, context, edge_id, backend_vnics, vnic_dict, if_changed): # Validate that all the interfaces on the Edge @@ -175,11 +207,13 @@ class ErrorDhcpEdgeJob(base_job.BaseJob): vnic_bind['tunnel_index'] == sub_if['tunnelId']): pass else: - LOG.warning('Housekeeping: subinterface %s for vnic ' - '%s on edge %s is not defined in ' - 'nsxv_edge_vnic_bindings', - sub_if['tunnelId'], - vnic['index'], edge_id) + self.error_count += 1 + self.error_info = base_job.housekeeper_warning( + self.error_info, + 'subinterface %s for vnic %s on edge %s is not ' + 'defined in nsxv_edge_vnic_bindings', + sub_if['tunnelId'], vnic['index'], edge_id) + self.fixed_sub_if_count += 1 if_changed[vnic['index']] = True vnic['subInterfaces']['subInterfaces'].remove(sub_if) @@ -210,27 +244,34 @@ class ErrorDhcpEdgeJob(base_job.BaseJob): if sub_if['tunnelId'] == tunnel_index: found = True if sub_if.get('logicalSwitchName') != network_id: - LOG.warning('Housekeeping: subinterface %s on ' - 'vnic %s on edge %s should be ' - 'connected to network %s', - tunnel_index, vnic['index'], - edge_id, network_id) + self.error_count += 1 + self.error_info = base_job.housekeeper_warning( + self.error_info, + 'subinterface %s on vnic %s on edge %s ' + 'should be connected to network %s', + tunnel_index, vnic['index'], edge_id, + network_id) if_changed[vnic['index']] = True if not self.readonly: self._recreate_vnic_subinterface( context, network_id, edge_id, vnic, tunnel_index) + self.fixed_count += 1 sub_if['name'] = network_id if not found: - LOG.warning('Housekeeping: subinterface 
%s on vnic ' - '%s on edge %s should be connected to ' - 'network %s but is missing', tunnel_index, - vnic['index'], edge_id, network_id) + self.error_count += 1 + self.error_info = base_job.housekeeper_warning( + self.error_info, + 'subinterface %s on vnic %s on edge %s should be ' + 'connected to network %s but is missing', + tunnel_index, vnic['index'], edge_id, network_id) if_changed[vnic['index']] = True + if not self.readonly: self._recreate_vnic_subinterface( context, network_id, edge_id, vnic, tunnel_index) + self.fixed_sub_if_count += 1 def _recreate_vnic_subinterface( self, context, network_id, edge_id, vnic, tunnel_index): diff --git a/vmware_nsx/plugins/nsx_v/housekeeper/lbaas_pending.py b/vmware_nsx/plugins/nsx_v/housekeeper/lbaas_pending.py index 6d7a41e314..1bb693e96e 100644 --- a/vmware_nsx/plugins/nsx_v/housekeeper/lbaas_pending.py +++ b/vmware_nsx/plugins/nsx_v/housekeeper/lbaas_pending.py @@ -49,6 +49,9 @@ class LbaasPendingJob(base_job.BaseJob): def run(self, context): super(LbaasPendingJob, self).run(context) curr_time = time.time() + error_count = 0 + fixed_count = 0 + error_info = '' for model in self.lbaas_models: sess = context.session @@ -65,11 +68,15 @@ class LbaasPendingJob(base_job.BaseJob): if lifetime > ELEMENT_LIFETIME: # Entry has been pending for more than lifetime. 
# Report and remove when in R/W mode - LOG.warning('Housekeeping: LBaaS %s %s is stuck in ' - 'pending state', - model.NAME, element['id']) + error_count += 1 + error_info = base_job.housekeeper_warning( + error_info, + 'LBaaS %s %s is stuck in pending state', + model.NAME, element['id']) + if not self.readonly: element['provisioning_status'] = constants.ERROR + fixed_count += 1 del self.lbaas_objects[element['id']] else: # Entry is still pending but haven't reached lifetime @@ -93,3 +100,9 @@ class LbaasPendingJob(base_job.BaseJob): LOG.debug('Housekeeping: LBaaS %s %s is back to normal', self.lbaas_objects[obj_id]['model'].NAME, obj_id) del self.lbaas_objects[obj_id] + + if error_count == 0: + error_info = 'No LBaaS objects in pending state' + return {'error_count': error_count, + 'fixed_count': fixed_count, + 'error_info': error_info} diff --git a/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py index ff90956726..c0aa9526f3 100644 --- a/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py +++ b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py @@ -17,6 +17,7 @@ import mock from neutron.tests import base from neutron_lib.plugins import constants +from vmware_nsx.plugins.common.housekeeper import base_job from vmware_nsx.plugins.nsx_v.housekeeper import error_backup_edge FAKE_ROUTER_BINDINGS = [ @@ -42,7 +43,7 @@ class ErrorBackupEdgeTestCaseReadOnly(base.BaseTestCase): mock.patch('neutron_lib.plugins.directory.get_plugin', side_effect=get_plugin_mock).start() self.log = mock.Mock() - error_backup_edge.LOG = self.log + base_job.LOG = self.log self.job = error_backup_edge.ErrorBackupEdgeJob(self._is_readonly()) def test_clean_run(self): diff --git a/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_dhcp_edge.py b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_dhcp_edge.py index 0854aa32ef..ca4bc4dee3 100644 --- 
a/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_dhcp_edge.py +++ b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_dhcp_edge.py @@ -20,6 +20,7 @@ import mock from neutron.tests import base from neutron_lib.plugins import constants +from vmware_nsx.plugins.common.housekeeper import base_job from vmware_nsx.plugins.nsx_v.housekeeper import error_dhcp_edge FAKE_ROUTER_BINDINGS = [ @@ -289,7 +290,7 @@ class ErrorDhcpEdgeTestCaseReadOnly(base.BaseTestCase): mock.patch.object(self.plugin, 'get_availability_zone_name_by_edge', return_value='default').start() self.log = mock.Mock() - error_dhcp_edge.LOG = self.log + base_job.LOG = self.log self.job = error_dhcp_edge.ErrorDhcpEdgeJob(self._is_readonly()) def test_clean_run(self):