diff --git a/vmware_nsx/common/config.py b/vmware_nsx/common/config.py
index 9a307ebf3f..d04f85a105 100644
--- a/vmware_nsx/common/config.py
+++ b/vmware_nsx/common/config.py
@@ -229,6 +229,7 @@ cluster_opts = [
"network connection")),
]
+
nsx_common_opts = [
cfg.StrOpt('nsx_l2gw_driver',
help=_("Specify the class path for the Layer 2 gateway "
@@ -251,8 +252,17 @@ nsx_common_opts = [
help=_("An ordered list of extension driver "
"entrypoints to be loaded from the "
"vmware_nsx.extension_drivers namespace.")),
+ cfg.StrOpt('smtp_gateway',
+ help=_("(Optional) IP address of SMTP gateway to use for "
+ "admin warnings.")),
+ cfg.StrOpt('smtp_from_addr',
+ help=_("(Optional) email address to use for outgoing admin "
+ "notifications.")),
+ cfg.ListOpt('snmp_to_list',
+ default=[],
+ help=_("(Optional) List of email addresses for "
+ "notifications.")),
]
-
nsx_v3_opts = [
cfg.ListOpt('nsx_api_user',
default=['admin'],
diff --git a/vmware_nsx/extensions/housekeeper.py b/vmware_nsx/extensions/housekeeper.py
index 8916444960..fae2c63e22 100644
--- a/vmware_nsx/extensions/housekeeper.py
+++ b/vmware_nsx/extensions/housekeeper.py
@@ -34,6 +34,12 @@ RESOURCE_ATTRIBUTE_MAP = {
'allow_post': False, 'allow_put': False, 'is_visible': True},
'enabled': {
'allow_post': False, 'allow_put': False, 'is_visible': True},
+ 'error_count': {
+ 'allow_post': False, 'allow_put': False, 'is_visible': True},
+ 'fixed_count': {
+ 'allow_post': False, 'allow_put': False, 'is_visible': True},
+ 'error_info': {
+ 'allow_post': False, 'allow_put': False, 'is_visible': True},
}
}
diff --git a/vmware_nsx/plugins/common/housekeeper/base_job.py b/vmware_nsx/plugins/common/housekeeper/base_job.py
index 5dc4cc1800..c06df71ee1 100644
--- a/vmware_nsx/plugins/common/housekeeper/base_job.py
+++ b/vmware_nsx/plugins/common/housekeeper/base_job.py
@@ -59,3 +59,23 @@ class BaseJob(object):
@abc.abstractmethod
def get_project_plugin(self, plugin):
pass
+
+
+def housekeeper_info(info, fmt, *args):
+ msg = fmt % args
+ if info:
+ info = "%s\n%s" % (info, msg)
+ else:
+ info = msg
+ LOG.info("Housekeeping: %s", msg)
+ return info
+
+
+def housekeeper_warning(info, fmt, *args):
+ msg = fmt % args
+ if info:
+ info = "%s\n%s" % (info, msg)
+ else:
+ info = msg
+ LOG.warning("Housekeeping: %s", msg)
+ return info
diff --git a/vmware_nsx/plugins/common/housekeeper/housekeeper.py b/vmware_nsx/plugins/common/housekeeper/housekeeper.py
index 3037187acf..48044952fc 100644
--- a/vmware_nsx/plugins/common/housekeeper/housekeeper.py
+++ b/vmware_nsx/plugins/common/housekeeper/housekeeper.py
@@ -13,6 +13,11 @@
# License for the specific language governing permissions and limitations
# under the License.
+import smtplib
+
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+
from oslo_config import cfg
from oslo_log import log
import stevedore
@@ -24,12 +29,23 @@ LOG = log.getLogger(__name__)
ALL_DUMMY_JOB = {
'name': 'all',
'description': 'Execute all housekeepers',
- 'enabled': True}
+ 'enabled': True,
+ 'error_count': 0,
+ 'fixed_count': 0,
+ 'error_info': None}
class NsxvHousekeeper(stevedore.named.NamedExtensionManager):
def __init__(self, hk_ns, hk_jobs):
+ self.email_notifier = None
+ if (cfg.CONF.smtp_gateway and
+ cfg.CONF.smtp_from_addr and
+ cfg.CONF.snmp_to_list):
+ self.email_notifier = HousekeeperEmailNotifier()
+
self.readonly = cfg.CONF.nsxv.housekeeping_readonly
+ self.results = {}
+
if self.readonly:
LOG.info('Housekeeper initialized in readonly mode')
else:
@@ -45,40 +61,140 @@ class NsxvHousekeeper(stevedore.named.NamedExtensionManager):
self.jobs[job.obj.get_name()] = job.obj
def get(self, job_name):
- if job_name == ALL_DUMMY_JOB.get('name'):
- return ALL_DUMMY_JOB
+ if job_name == ALL_DUMMY_JOB['name']:
+ return {'name': job_name,
+ 'description': ALL_DUMMY_JOB['description'],
+ 'enabled': job_name in self.jobs,
+ 'error_count': self.results.get(
+ job_name, {}).get('error_count', 0),
+ 'fixed_count': self.results.get(
+ job_name, {}).get('fixed_count', 0),
+ 'error_info': self.results.get(
+ job_name, {}).get('error_info', '')}
for job in self:
name = job.obj.get_name()
if job_name == name:
return {'name': job_name,
'description': job.obj.get_description(),
- 'enabled': job_name in self.jobs}
+ 'enabled': job_name in self.jobs,
+ 'error_count': self.results.get(
+ job_name, {}).get('error_count', 0),
+ 'fixed_count': self.results.get(
+ job_name, {}).get('fixed_count', 0),
+ 'error_info': self.results.get(
+ job_name, {}).get('error_info', '')}
raise n_exc.ObjectNotFound(id=job_name)
def list(self):
- results = [ALL_DUMMY_JOB]
+ results = [{'name': ALL_DUMMY_JOB['name'],
+ 'description': ALL_DUMMY_JOB['description'],
+ 'enabled': ALL_DUMMY_JOB['name'] in self.jobs,
+ 'error_count': self.results.get(
+ ALL_DUMMY_JOB['name'], {}).get('error_count', 0),
+ 'fixed_count': self.results.get(
+ ALL_DUMMY_JOB['name'], {}).get('fixed_count', 0),
+ 'error_info': self.results.get(
+ ALL_DUMMY_JOB['name'], {}).get('error_info', '')}]
for job in self:
job_name = job.obj.get_name()
results.append({'name': job_name,
'description': job.obj.get_description(),
- 'enabled': job_name in self.jobs})
+ 'enabled': job_name in self.jobs,
+ 'error_count': self.results.get(
+ job_name, {}).get('error_count', 0),
+ 'fixed_count': self.results.get(
+ job_name, {}).get('fixed_count', 0),
+ 'error_info': self.results.get(
+ job_name, {}).get('error_info', '')})
return results
def run(self, context, job_name):
+ self.results = {}
if context.is_admin:
+ if self.email_notifier:
+ self.email_notifier.start('Cloud Housekeeper Execution Report')
+
with locking.LockManager.get_lock('nsx-housekeeper'):
+ error_count = 0
+ fixed_count = 0
+ error_info = ''
if job_name == ALL_DUMMY_JOB.get('name'):
for job in self.jobs.values():
- job.run(context)
+ result = job.run(context)
+ if result:
+ if self.email_notifier and result['error_count']:
+ self._add_job_text_to_notifier(job, result)
+ error_count += result['error_count']
+ fixed_count += result['fixed_count']
+ error_info += result['error_info'] + "\n"
+ self.results[job_name] = {
+ 'error_count': error_count,
+ 'fixed_count': fixed_count,
+ 'error_info': error_info
+ }
+
else:
job = self.jobs.get(job_name)
if job:
- job.run(context)
+ result = job.run(context)
+ if result:
+ error_count = result['error_count']
+ if self.email_notifier:
+ self._add_job_text_to_notifier(job, result)
+ self.results[job.get_name()] = result
else:
raise n_exc.ObjectNotFound(id=job_name)
+
+ if self.email_notifier and error_count:
+ self.email_notifier.send()
else:
raise n_exc.AdminRequired()
+
+ def _add_job_text_to_notifier(self, job, result):
+ self.email_notifier.add_text("%s:", job.get_name())
+ self.email_notifier.add_text(
+ '%d errors found, %d fixed\n%s\n\n',
+ result['error_count'],
+ result['fixed_count'],
+ result['error_info'])
+
+
+class HousekeeperEmailNotifier(object):
+ def __init__(self):
+ self.msg = None
+ self.html = None
+ self.has_text = False
+
+ def start(self, subject):
+ self.msg = MIMEMultipart('alternative')
+ self.msg['Subject'] = subject
+ self.msg['From'] = cfg.CONF.smtp_from_addr
+ self.msg['To'] = ', '.join(cfg.CONF.snmp_to_list)
+ self.html = '<html><div>'
+ self.has_text = False
+
+ def add_text(self, fmt, *args):
+ self.has_text = True
+ text = fmt % args
+ LOG.debug("Housekeeper emailer adding text %s", text)
+ self.html += text.replace("\n", "<br>") + "<br>\n"
+
+ def send(self):
+ if self.has_text:
+ self.html += "</div></html>"
+ part1 = MIMEText(self.html, 'html')
+ self.msg.attach(part1)
+
+ s = smtplib.SMTP(cfg.CONF.smtp_gateway)
+
+ s.sendmail(cfg.CONF.smtp_from_addr,
+ cfg.CONF.snmp_to_list,
+ self.msg.as_string())
+ s.quit()
+
+ self.msg = None
+ self.html = None
diff --git a/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py b/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py
index 1918942b1e..e0cd749433 100644
--- a/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py
+++ b/vmware_nsx/plugins/nsx_v/housekeeper/error_backup_edge.py
@@ -44,6 +44,9 @@ class ErrorBackupEdgeJob(base_job.BaseJob):
def run(self, context):
super(ErrorBackupEdgeJob, self).run(context)
+ error_count = 0
+ fixed_count = 0
+ error_info = ''
# Gather ERROR state backup edges into dict
filters = {'status': [constants.ERROR]}
@@ -54,20 +57,30 @@ class ErrorBackupEdgeJob(base_job.BaseJob):
if not error_edge_bindings:
LOG.debug('Housekeeping: no backup edges in ERROR state detected')
- return
+ return {'error_count': 0,
+ 'fixed_count': 0,
+ 'error_info': 'No backup edges in ERROR state detected'}
# Keep list of current broken backup edges - as it may change while
# HK is running
for binding in error_edge_bindings:
- LOG.warning('Housekeeping: Backup Edge appliance %s is in ERROR'
- ' state', binding['edge_id'])
+ error_count += 1
+ error_info = base_job.housekeeper_warning(
+ error_info, 'Backup Edge appliance %s is in ERROR state',
+ binding['edge_id'])
if not self.readonly:
with locking.LockManager.get_lock(binding['edge_id']):
- self._handle_backup_edge(context, binding)
+ if self._handle_backup_edge(context, binding):
+ fixed_count += 1
+
+ return {'error_count': error_count,
+ 'fixed_count': fixed_count,
+ 'error_info': error_info}
def _handle_backup_edge(self, context, binding):
dist = (binding['edge_type'] == nsxv_constants.VDR_EDGE)
+ result = True
az = self.azs.get_availability_zone(
binding['availability_zone'])
try:
@@ -90,7 +103,9 @@ class ErrorBackupEdgeJob(base_job.BaseJob):
if not update_result:
LOG.warning('Housekeeping: failed to recover Edge '
'appliance %s, trying to delete', binding['edge_id'])
- self._delete_edge(context, binding, dist)
+ result = self._delete_edge(context, binding, dist)
+
+ return result
def _delete_edge(self, context, binding, dist):
try:
@@ -104,6 +119,8 @@ class ErrorBackupEdgeJob(base_job.BaseJob):
try:
self.plugin.nsx_v.delete_edge(context, binding['router_id'],
binding['edge_id'], dist=dist)
+ return True
+
except Exception as e:
LOG.warning('Housekeeping: Failed to delete edge %s with '
'exception %s', binding['edge_id'], e)
diff --git a/vmware_nsx/plugins/nsx_v/housekeeper/error_dhcp_edge.py b/vmware_nsx/plugins/nsx_v/housekeeper/error_dhcp_edge.py
index 1741675291..08821f7cfe 100644
--- a/vmware_nsx/plugins/nsx_v/housekeeper/error_dhcp_edge.py
+++ b/vmware_nsx/plugins/nsx_v/housekeeper/error_dhcp_edge.py
@@ -27,6 +27,12 @@ LOG = log.getLogger(__name__)
class ErrorDhcpEdgeJob(base_job.BaseJob):
+ def __init__(self, readonly):
+ super(ErrorDhcpEdgeJob, self).__init__(readonly)
+ self.error_count = 0
+ self.fixed_count = 0
+ self.fixed_sub_if_count = 0
+ self.error_info = ''
def get_project_plugin(self, plugin):
return plugin.get_plugin_by_type(projectpluginmap.NsxPlugins.NSX_V)
@@ -39,6 +45,10 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
def run(self, context):
super(ErrorDhcpEdgeJob, self).run(context)
+ self.error_count = 0
+ self.fixed_count = 0
+ self.fixed_sub_if_count = 0
+ self.error_info = ''
# Gather ERROR state DHCP edges into dict
filters = {'status': [constants.ERROR]}
@@ -47,7 +57,9 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
if not error_edge_bindings:
LOG.debug('Housekeeping: no DHCP edges in ERROR state detected')
- return
+ return {'error_count': self.error_count,
+ 'fixed_count': self.fixed_count,
+ 'error_info': 'No DHCP error state edges detected'}
with locking.LockManager.get_lock('nsx-dhcp-edge-pool'):
edge_dict = {}
@@ -70,8 +82,14 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
self._validate_dhcp_edge(
context, edge_dict, pfx_dict, networks, edge_id)
except Exception as e:
- LOG.error('Failed to recover DHCP Edge %s (%s)',
- edge_id, e)
+ self.error_count += 1
+ self.error_info = base_job.housekeeper_warning(
+ self.error_info,
+ 'Failed to recover DHCP Edge %s (%s)', edge_id, e)
+
+ return {'error_count': self.error_count,
+ 'fixed_count': self.fixed_count,
+ 'error_info': self.error_info}
def _validate_dhcp_edge(
self, context, edge_dict, pfx_dict, networks, edge_id):
@@ -95,21 +113,29 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
if net_id is None:
# Delete router binding as we do not have such network
# in Neutron
- LOG.warning('Housekeeping: router binding %s for edge '
- '%s has no matching neutron network',
- router_id, edge_id)
+ self.error_count += 1
+ self.error_info = base_job.housekeeper_warning(
+ self.error_info,
+ 'router binding %s for edge %s has no matching '
+ 'neutron network', router_id, edge_id)
+
if not self.readonly:
nsxv_db.delete_nsxv_router_binding(
context.session, binding['router_id'])
+ self.fixed_count += 1
else:
if net_id not in edge_networks:
# Create vNic bind here
- LOG.warning('Housekeeping: edge %s vnic binding '
- 'missing for network %s', edge_id,
- net_id)
+ self.error_count += 1
+ self.error_info = base_job.housekeeper_warning(
+ self.error_info,
+ 'edge %s vnic binding missing for network %s',
+ edge_id, net_id)
+
if not self.readonly:
nsxv_db.allocate_edge_vnic_with_tunnel_index(
context.session, edge_id, net_id, az_name)
+ self.fixed_count += 1
# Step (B)
# Find vNic bindings which reference invalid networks or aren't
@@ -122,12 +148,16 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
for bind in vnic_binds:
if bind['network_id'] not in networks:
- LOG.warning('Housekeeping: edge vnic binding for edge '
- '%s is for invalid network id %s',
- edge_id, bind['network_id'])
+ self.error_count += 1
+ self.error_info = base_job.housekeeper_warning(
+ self.error_info,
+ 'edge vnic binding for edge %s is for invalid '
+ 'network id %s', edge_id, bind['network_id'])
+
if not self.readonly:
nsxv_db.free_edge_vnic_by_network(
context.session, edge_id, bind['network_id'])
+ self.fixed_count += 1
# Step (C)
# Verify that backend is in sync with Neutron
@@ -158,6 +188,8 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
self._update_router_bindings(context, edge_id)
+ self.fixed_count += self.fixed_sub_if_count
+
def _validate_edge_subinterfaces(self, context, edge_id, backend_vnics,
vnic_dict, if_changed):
# Validate that all the interfaces on the Edge
@@ -175,11 +207,13 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
vnic_bind['tunnel_index'] == sub_if['tunnelId']):
pass
else:
- LOG.warning('Housekeeping: subinterface %s for vnic '
- '%s on edge %s is not defined in '
- 'nsxv_edge_vnic_bindings',
- sub_if['tunnelId'],
- vnic['index'], edge_id)
+ self.error_count += 1
+ self.error_info = base_job.housekeeper_warning(
+ self.error_info,
+ 'subinterface %s for vnic %s on edge %s is not '
+ 'defined in nsxv_edge_vnic_bindings',
+ sub_if['tunnelId'], vnic['index'], edge_id)
+ self.fixed_sub_if_count += 1
if_changed[vnic['index']] = True
vnic['subInterfaces']['subInterfaces'].remove(sub_if)
@@ -210,27 +244,34 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
if sub_if['tunnelId'] == tunnel_index:
found = True
if sub_if.get('logicalSwitchName') != network_id:
- LOG.warning('Housekeeping: subinterface %s on '
- 'vnic %s on edge %s should be '
- 'connected to network %s',
- tunnel_index, vnic['index'],
- edge_id, network_id)
+ self.error_count += 1
+ self.error_info = base_job.housekeeper_warning(
+ self.error_info,
+ 'subinterface %s on vnic %s on edge %s '
+ 'should be connected to network %s',
+ tunnel_index, vnic['index'], edge_id,
+ network_id)
if_changed[vnic['index']] = True
if not self.readonly:
self._recreate_vnic_subinterface(
context, network_id, edge_id, vnic,
tunnel_index)
+ self.fixed_count += 1
sub_if['name'] = network_id
if not found:
- LOG.warning('Housekeeping: subinterface %s on vnic '
- '%s on edge %s should be connected to '
- 'network %s but is missing', tunnel_index,
- vnic['index'], edge_id, network_id)
+ self.error_count += 1
+ self.error_info = base_job.housekeeper_warning(
+ self.error_info,
+ 'subinterface %s on vnic %s on edge %s should be '
+ 'connected to network %s but is missing',
+ tunnel_index, vnic['index'], edge_id, network_id)
if_changed[vnic['index']] = True
+
if not self.readonly:
self._recreate_vnic_subinterface(
context, network_id, edge_id, vnic,
tunnel_index)
+ self.fixed_sub_if_count += 1
def _recreate_vnic_subinterface(
self, context, network_id, edge_id, vnic, tunnel_index):
diff --git a/vmware_nsx/plugins/nsx_v/housekeeper/lbaas_pending.py b/vmware_nsx/plugins/nsx_v/housekeeper/lbaas_pending.py
index 6d7a41e314..1bb693e96e 100644
--- a/vmware_nsx/plugins/nsx_v/housekeeper/lbaas_pending.py
+++ b/vmware_nsx/plugins/nsx_v/housekeeper/lbaas_pending.py
@@ -49,6 +49,9 @@ class LbaasPendingJob(base_job.BaseJob):
def run(self, context):
super(LbaasPendingJob, self).run(context)
curr_time = time.time()
+ error_count = 0
+ fixed_count = 0
+ error_info = ''
for model in self.lbaas_models:
sess = context.session
@@ -65,11 +68,15 @@ class LbaasPendingJob(base_job.BaseJob):
if lifetime > ELEMENT_LIFETIME:
# Entry has been pending for more than lifetime.
# Report and remove when in R/W mode
- LOG.warning('Housekeeping: LBaaS %s %s is stuck in '
- 'pending state',
- model.NAME, element['id'])
+ error_count += 1
+ error_info = base_job.housekeeper_warning(
+ error_info,
+ 'LBaaS %s %s is stuck in pending state',
+ model.NAME, element['id'])
+
if not self.readonly:
element['provisioning_status'] = constants.ERROR
+ fixed_count += 1
del self.lbaas_objects[element['id']]
else:
# Entry is still pending but haven't reached lifetime
@@ -93,3 +100,9 @@ class LbaasPendingJob(base_job.BaseJob):
LOG.debug('Housekeeping: LBaaS %s %s is back to normal',
self.lbaas_objects[obj_id]['model'].NAME, obj_id)
del self.lbaas_objects[obj_id]
+
+ if error_count == 0:
+ error_info = 'No LBaaS objects in pending state'
+ return {'error_count': error_count,
+ 'fixed_count': fixed_count,
+ 'error_info': error_info}
diff --git a/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py
index ff90956726..c0aa9526f3 100644
--- a/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py
+++ b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_backup_edge.py
@@ -17,6 +17,7 @@ import mock
from neutron.tests import base
from neutron_lib.plugins import constants
+from vmware_nsx.plugins.common.housekeeper import base_job
from vmware_nsx.plugins.nsx_v.housekeeper import error_backup_edge
FAKE_ROUTER_BINDINGS = [
@@ -42,7 +43,7 @@ class ErrorBackupEdgeTestCaseReadOnly(base.BaseTestCase):
mock.patch('neutron_lib.plugins.directory.get_plugin',
side_effect=get_plugin_mock).start()
self.log = mock.Mock()
- error_backup_edge.LOG = self.log
+ base_job.LOG = self.log
self.job = error_backup_edge.ErrorBackupEdgeJob(self._is_readonly())
def test_clean_run(self):
diff --git a/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_dhcp_edge.py b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_dhcp_edge.py
index 0854aa32ef..ca4bc4dee3 100644
--- a/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_dhcp_edge.py
+++ b/vmware_nsx/tests/unit/nsx_v/housekeeper/test_error_dhcp_edge.py
@@ -20,6 +20,7 @@ import mock
from neutron.tests import base
from neutron_lib.plugins import constants
+from vmware_nsx.plugins.common.housekeeper import base_job
from vmware_nsx.plugins.nsx_v.housekeeper import error_dhcp_edge
FAKE_ROUTER_BINDINGS = [
@@ -289,7 +290,7 @@ class ErrorDhcpEdgeTestCaseReadOnly(base.BaseTestCase):
mock.patch.object(self.plugin, 'get_availability_zone_name_by_edge',
return_value='default').start()
self.log = mock.Mock()
- error_dhcp_edge.LOG = self.log
+ base_job.LOG = self.log
self.job = error_dhcp_edge.ErrorDhcpEdgeJob(self._is_readonly())
def test_clean_run(self):