Housekeeper: deliver output

Add functionality to the housekeeper:
- Add error count, fixed count and info fields to extension
- Output the job results via email to a specified admin email address

Change-Id: Ifab4c1cb293e90d950f5e4b80a6f7cb93129e816
This commit is contained in:
Kobi Samoray 2018-06-26 14:53:17 +03:00
parent f570c651bf
commit 7296491659
9 changed files with 270 additions and 45 deletions

View File

@ -229,6 +229,7 @@ cluster_opts = [
"network connection")),
]
nsx_common_opts = [
cfg.StrOpt('nsx_l2gw_driver',
help=_("Specify the class path for the Layer 2 gateway "
@ -251,8 +252,17 @@ nsx_common_opts = [
help=_("An ordered list of extension driver "
"entrypoints to be loaded from the "
"vmware_nsx.extension_drivers namespace.")),
# SMTP settings for the housekeeper's admin email notifications.
# NOTE: the recipient option is spelled 'snmp_to_list' (not 'smtp'); the
# spelling is kept because the housekeeper reads cfg.CONF.snmp_to_list.
cfg.StrOpt('smtp_gateway',
           help=_("(Optional) IP address of SMTP gateway to use for "
                  "admin warnings.")),
cfg.StrOpt('smtp_from_addr',
           help=_("(Optional) email address to use for outgoing admin "
                  "notifications.")),
cfg.ListOpt('snmp_to_list',
            default=[],
            help=_("(Optional) List of email addresses for "
                   "notifications.")),
]
nsx_v3_opts = [
cfg.ListOpt('nsx_api_user',
default=['admin'],

View File

@ -34,6 +34,12 @@ RESOURCE_ATTRIBUTE_MAP = {
'allow_post': False, 'allow_put': False, 'is_visible': True},
'enabled': {
'allow_post': False, 'allow_put': False, 'is_visible': True},
'error_count': {
'allow_post': False, 'allow_put': False, 'is_visible': True},
'fixed_count': {
'allow_post': False, 'allow_put': False, 'is_visible': True},
'error_info': {
'allow_post': False, 'allow_put': False, 'is_visible': True},
}
}

View File

@ -59,3 +59,23 @@ class BaseJob(object):
@abc.abstractmethod
def get_project_plugin(self, plugin):
    """Return the plugin object this job should operate on.

    Abstract hook: this base implementation has no logic and returns
    None; concrete jobs override it.

    :param plugin: plugin object from which the job's plugin is obtained
    """
def housekeeper_info(info, fmt, *args):
    """Log a housekeeping message at INFO level and append it to *info*.

    :param info: report text accumulated so far; any falsy value (None
                 or '') means nothing has been accumulated yet
    :param fmt: %-style format string of the new message
    :param args: values interpolated into fmt
    :returns: the accumulated text with the new message added on its own
              line, or just the new message when info was empty
    """
    line = fmt % args
    # Only insert the newline separator when there is earlier text.
    combined = "%s\n%s" % (info, line) if info else line
    LOG.info("Housekeeping: %s", line)
    return combined
def housekeeper_warning(info, fmt, *args):
    """Log a housekeeping message at WARNING level and append it to *info*.

    :param info: report text accumulated so far; any falsy value (None
                 or '') means nothing has been accumulated yet
    :param fmt: %-style format string of the new message
    :param args: values interpolated into fmt
    :returns: the accumulated text with the new message added on its own
              line, or just the new message when info was empty
    """
    line = fmt % args
    # Only insert the newline separator when there is earlier text.
    combined = "%s\n%s" % (info, line) if info else line
    LOG.warning("Housekeeping: %s", line)
    return combined

View File

@ -13,6 +13,11 @@
# License for the specific language governing permissions and limitations
# under the License.
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from oslo_config import cfg
from oslo_log import log
import stevedore
@ -24,12 +29,23 @@ LOG = log.getLogger(__name__)
ALL_DUMMY_JOB = {
'name': 'all',
'description': 'Execute all housekeepers',
'enabled': True}
'enabled': True,
'error_count': 0,
'fixed_count': 0,
'error_info': None}
class NsxvHousekeeper(stevedore.named.NamedExtensionManager):
def __init__(self, hk_ns, hk_jobs):
self.email_notifier = None
if (cfg.CONF.smtp_gateway and
cfg.CONF.smtp_from_addr and
cfg.CONF.snmp_to_list):
self.email_notifier = HousekeeperEmailNotifier()
self.readonly = cfg.CONF.nsxv.housekeeping_readonly
self.results = {}
if self.readonly:
LOG.info('Housekeeper initialized in readonly mode')
else:
@ -45,40 +61,140 @@ class NsxvHousekeeper(stevedore.named.NamedExtensionManager):
self.jobs[job.obj.get_name()] = job.obj
def get(self, job_name):
if job_name == ALL_DUMMY_JOB.get('name'):
return ALL_DUMMY_JOB
if job_name == ALL_DUMMY_JOB['name']:
return {'name': job_name,
'description': ALL_DUMMY_JOB['description'],
'enabled': job_name in self.jobs,
'error_count': self.results.get(
job_name, {}).get('error_count', 0),
'fixed_count': self.results.get(
job_name, {}).get('fixed_count', 0),
'error_info': self.results.get(
job_name, {}).get('error_info', '')}
for job in self:
name = job.obj.get_name()
if job_name == name:
return {'name': job_name,
'description': job.obj.get_description(),
'enabled': job_name in self.jobs}
'enabled': job_name in self.jobs,
'error_count': self.results.get(
job_name, {}).get('error_count', 0),
'fixed_count': self.results.get(
job_name, {}).get('fixed_count', 0),
'error_info': self.results.get(
job_name, {}).get('error_info', '')}
raise n_exc.ObjectNotFound(id=job_name)
def list(self):
results = [ALL_DUMMY_JOB]
results = [{'name': ALL_DUMMY_JOB['name'],
'description': ALL_DUMMY_JOB['description'],
'enabled': ALL_DUMMY_JOB['name'] in self.jobs,
'error_count': self.results.get(
ALL_DUMMY_JOB['name'], {}).get('error_count', 0),
'fixed_count': self.results.get(
ALL_DUMMY_JOB['name'], {}).get('fixed_count', 0),
'error_info': self.results.get(
ALL_DUMMY_JOB['name'], {}).get('error_info', '')}]
for job in self:
job_name = job.obj.get_name()
results.append({'name': job_name,
'description': job.obj.get_description(),
'enabled': job_name in self.jobs})
'enabled': job_name in self.jobs,
'error_count': self.results.get(
job_name, {}).get('error_count', 0),
'fixed_count': self.results.get(
job_name, {}).get('fixed_count', 0),
'error_info': self.results.get(
job_name, {}).get('error_info', '')})
return results
def run(self, context, job_name):
self.results = {}
if context.is_admin:
if self.email_notifier:
self.email_notifier.start('Cloud Housekeeper Execution Report')
with locking.LockManager.get_lock('nsx-housekeeper'):
error_count = 0
fixed_count = 0
error_info = ''
if job_name == ALL_DUMMY_JOB.get('name'):
for job in self.jobs.values():
job.run(context)
result = job.run(context)
if result:
if self.email_notifier and result['error_count']:
self._add_job_text_to_notifier(job, result)
error_count += result['error_count']
fixed_count += result['fixed_count']
error_info += result['error_info'] + "\n"
self.results[job_name] = {
'error_count': error_count,
'fixed_count': fixed_count,
'error_info': error_info
}
else:
job = self.jobs.get(job_name)
if job:
job.run(context)
result = job.run(context)
if result:
error_count = result['error_count']
if self.email_notifier:
self._add_job_text_to_notifier(job, result)
self.results[job.get_name()] = result
else:
raise n_exc.ObjectNotFound(id=job_name)
if self.email_notifier and error_count:
self.email_notifier.send()
else:
raise n_exc.AdminRequired()
def _add_job_text_to_notifier(self, job, result):
    """Add one job's outcome to the pending email report.

    Writes a bold heading with the job name, followed by a summary line
    and the job's error details.

    :param job: job object; only its get_name() is used
    :param result: dict with 'error_count', 'fixed_count' and
                   'error_info' entries
    """
    notifier = self.email_notifier
    notifier.add_text("<b>%s:</b>", job.get_name())
    # Looked up only after the heading was added, so a malformed result
    # fails at the same point as before.
    summary_args = (result['error_count'],
                    result['fixed_count'],
                    result['error_info'])
    notifier.add_text('%d errors found, %d fixed\n%s\n\n', *summary_args)
class HousekeeperEmailNotifier(object):
    """Collect housekeeper report lines and mail them as one HTML message.

    Intended call order is start() -> add_text() (any number of times) ->
    send().  The SMTP gateway, sender and recipients are read from
    cfg.CONF (smtp_gateway, smtp_from_addr, snmp_to_list).
    """

    def __init__(self):
        # MIME message under construction; None until start() is called
        self.msg = None
        # HTML body accumulated so far; None until start() is called
        self.html = None
        # True once add_text() has contributed something worth sending
        self.has_text = False

    @staticmethod
    def _escape_arg(arg):
        """Return *arg* with '&', '<' and '>' escaped when it is text.

        Values that are not string-like (for example the ints fed to %d)
        are returned untouched so numeric format specifiers keep working.
        """
        # Function-scope import: keeps the new dependency local to the
        # only place that needs it.
        from xml.sax import saxutils
        try:
            return saxutils.escape(arg)
        except (AttributeError, TypeError):
            # Not a text object - nothing to escape.
            return arg

    def start(self, subject):
        """Begin a new report, discarding any content not yet sent.

        :param subject: subject line of the email
        """
        self.msg = MIMEMultipart('alternative')
        self.msg['Subject'] = subject
        self.msg['From'] = cfg.CONF.smtp_from_addr
        self.msg['To'] = ', '.join(cfg.CONF.snmp_to_list)
        self.html = '<html><div>'
        self.has_text = False

    def add_text(self, fmt, *args):
        """Append one formatted entry to the report body.

        :param fmt: %-style format string; treated as markup, so it may
                    contain HTML tags
        :param args: data interpolated into fmt.  Text arguments are
                     HTML-escaped first so that characters such as '<' or
                     '&' in job names or error details cannot corrupt the
                     message.

        Newlines in the formatted text are converted to <br>.
        """
        self.has_text = True
        text = fmt % tuple(self._escape_arg(arg) for arg in args)
        LOG.debug("Housekeeper emailer adding text %s", text)
        self.html += text.replace("\n", "<br>") + "<br>\n"

    def send(self):
        """Mail the report if anything was added, then reset the notifier.

        Nothing is sent (and no SMTP connection is opened) when no text
        was added since start().

        :raises: whatever smtplib raises on connection or delivery
                 failure; the SMTP socket is closed and the notifier state
                 is cleared in that case as well.
        """
        try:
            if self.has_text:
                self.html += "</div></html>"
                self.msg.attach(MIMEText(self.html, 'html'))
                server = smtplib.SMTP(cfg.CONF.smtp_gateway)
                try:
                    server.sendmail(cfg.CONF.smtp_from_addr,
                                    cfg.CONF.snmp_to_list,
                                    self.msg.as_string())
                    server.quit()
                finally:
                    # After a successful quit() this does nothing; after a
                    # failure it releases the socket without sending
                    # another command that could mask the original error.
                    server.close()
        finally:
            # Reset everything, including has_text, so a send() without a
            # fresh start() cannot try to extend a cleared body.
            self.msg = None
            self.html = None
            self.has_text = False

View File

@ -44,6 +44,9 @@ class ErrorBackupEdgeJob(base_job.BaseJob):
def run(self, context):
super(ErrorBackupEdgeJob, self).run(context)
error_count = 0
fixed_count = 0
error_info = ''
# Gather ERROR state backup edges into dict
filters = {'status': [constants.ERROR]}
@ -54,20 +57,30 @@ class ErrorBackupEdgeJob(base_job.BaseJob):
if not error_edge_bindings:
LOG.debug('Housekeeping: no backup edges in ERROR state detected')
return
return {'error_count': 0,
'fixed_count': 0,
'error_info': 'No backup edges in ERROR state detected'}
# Keep list of current broken backup edges - as it may change while
# HK is running
for binding in error_edge_bindings:
LOG.warning('Housekeeping: Backup Edge appliance %s is in ERROR'
' state', binding['edge_id'])
error_count += 1
error_info = base_job.housekeeper_warning(
error_info, 'Backup Edge appliance %s is in ERROR state',
binding['edge_id'])
if not self.readonly:
with locking.LockManager.get_lock(binding['edge_id']):
self._handle_backup_edge(context, binding)
if self._handle_backup_edge(context, binding):
fixed_count += 1
return {'error_count': error_count,
'fixed_count': fixed_count,
'error_info': error_info}
def _handle_backup_edge(self, context, binding):
dist = (binding['edge_type'] == nsxv_constants.VDR_EDGE)
result = True
az = self.azs.get_availability_zone(
binding['availability_zone'])
try:
@ -90,7 +103,9 @@ class ErrorBackupEdgeJob(base_job.BaseJob):
if not update_result:
LOG.warning('Housekeeping: failed to recover Edge '
'appliance %s, trying to delete', binding['edge_id'])
self._delete_edge(context, binding, dist)
result = self._delete_edge(context, binding, dist)
return result
def _delete_edge(self, context, binding, dist):
try:
@ -104,6 +119,8 @@ class ErrorBackupEdgeJob(base_job.BaseJob):
try:
self.plugin.nsx_v.delete_edge(context, binding['router_id'],
binding['edge_id'], dist=dist)
return True
except Exception as e:
LOG.warning('Housekeeping: Failed to delete edge %s with '
'exception %s', binding['edge_id'], e)

View File

@ -27,6 +27,12 @@ LOG = log.getLogger(__name__)
class ErrorDhcpEdgeJob(base_job.BaseJob):
def __init__(self, readonly):
    """Create the job with all result counters cleared.

    :param readonly: passed through unchanged to the base job constructor
    """
    super(ErrorDhcpEdgeJob, self).__init__(readonly)
    # Text of the problems reported so far, accumulated via
    # base_job.housekeeper_warning().
    self.error_info = ''
    # error_count: problems detected; fixed_count: problems repaired;
    # fixed_sub_if_count: sub-interface fixes, later added to fixed_count.
    self.error_count = self.fixed_count = self.fixed_sub_if_count = 0
def get_project_plugin(self, plugin):
    """Return the NSX-V plugin looked up through *plugin*.

    :param plugin: object exposing get_plugin_by_type()
    """
    nsx_v_type = projectpluginmap.NsxPlugins.NSX_V
    return plugin.get_plugin_by_type(nsx_v_type)
@ -39,6 +45,10 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
def run(self, context):
super(ErrorDhcpEdgeJob, self).run(context)
self.error_count = 0
self.fixed_count = 0
self.fixed_sub_if_count = 0
self.error_info = ''
# Gather ERROR state DHCP edges into dict
filters = {'status': [constants.ERROR]}
@ -47,7 +57,9 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
if not error_edge_bindings:
LOG.debug('Housekeeping: no DHCP edges in ERROR state detected')
return
return {'error_count': self.error_count,
'fixed_count': self.fixed_count,
'error_info': 'No DHCP error state edges detected'}
with locking.LockManager.get_lock('nsx-dhcp-edge-pool'):
edge_dict = {}
@ -70,8 +82,14 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
self._validate_dhcp_edge(
context, edge_dict, pfx_dict, networks, edge_id)
except Exception as e:
LOG.error('Failed to recover DHCP Edge %s (%s)',
edge_id, e)
self.error_count += 1
self.error_info = base_job.housekeeper_warning(
self.error_info,
'Failed to recover DHCP Edge %s (%s)', edge_id, e)
return {'error_count': self.error_count,
'fixed_count': self.fixed_count,
'error_info': self.error_info}
def _validate_dhcp_edge(
self, context, edge_dict, pfx_dict, networks, edge_id):
@ -95,21 +113,29 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
if net_id is None:
# Delete router binding as we do not have such network
# in Neutron
LOG.warning('Housekeeping: router binding %s for edge '
'%s has no matching neutron network',
router_id, edge_id)
self.error_count += 1
self.error_info = base_job.housekeeper_warning(
self.error_info,
'router binding %s for edge %s has no matching '
'neutron network', router_id, edge_id)
if not self.readonly:
nsxv_db.delete_nsxv_router_binding(
context.session, binding['router_id'])
self.fixed_count += 1
else:
if net_id not in edge_networks:
# Create vNic bind here
LOG.warning('Housekeeping: edge %s vnic binding '
'missing for network %s', edge_id,
net_id)
self.error_count += 1
self.error_info = base_job.housekeeper_warning(
self.error_info,
'edge %s vnic binding missing for network %s',
edge_id, net_id)
if not self.readonly:
nsxv_db.allocate_edge_vnic_with_tunnel_index(
context.session, edge_id, net_id, az_name)
self.fixed_count += 1
# Step (B)
# Find vNic bindings which reference invalid networks or aren't
@ -122,12 +148,16 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
for bind in vnic_binds:
if bind['network_id'] not in networks:
LOG.warning('Housekeeping: edge vnic binding for edge '
'%s is for invalid network id %s',
edge_id, bind['network_id'])
self.error_count += 1
self.error_info = base_job.housekeeper_warning(
self.error_info,
'edge vnic binding for edge %s is for invalid '
'network id %s', edge_id, bind['network_id'])
if not self.readonly:
nsxv_db.free_edge_vnic_by_network(
context.session, edge_id, bind['network_id'])
self.fixed_count += 1
# Step (C)
# Verify that backend is in sync with Neutron
@ -158,6 +188,8 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
self._update_router_bindings(context, edge_id)
self.fixed_count += self.fixed_sub_if_count
def _validate_edge_subinterfaces(self, context, edge_id, backend_vnics,
vnic_dict, if_changed):
# Validate that all the interfaces on the Edge
@ -175,11 +207,13 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
vnic_bind['tunnel_index'] == sub_if['tunnelId']):
pass
else:
LOG.warning('Housekeeping: subinterface %s for vnic '
'%s on edge %s is not defined in '
'nsxv_edge_vnic_bindings',
sub_if['tunnelId'],
vnic['index'], edge_id)
self.error_count += 1
self.error_info = base_job.housekeeper_warning(
self.error_info,
'subinterface %s for vnic %s on edge %s is not '
'defined in nsxv_edge_vnic_bindings',
sub_if['tunnelId'], vnic['index'], edge_id)
self.fixed_sub_if_count += 1
if_changed[vnic['index']] = True
vnic['subInterfaces']['subInterfaces'].remove(sub_if)
@ -210,27 +244,34 @@ class ErrorDhcpEdgeJob(base_job.BaseJob):
if sub_if['tunnelId'] == tunnel_index:
found = True
if sub_if.get('logicalSwitchName') != network_id:
LOG.warning('Housekeeping: subinterface %s on '
'vnic %s on edge %s should be '
'connected to network %s',
tunnel_index, vnic['index'],
edge_id, network_id)
self.error_count += 1
self.error_info = base_job.housekeeper_warning(
self.error_info,
'subinterface %s on vnic %s on edge %s '
'should be connected to network %s',
tunnel_index, vnic['index'], edge_id,
network_id)
if_changed[vnic['index']] = True
if not self.readonly:
self._recreate_vnic_subinterface(
context, network_id, edge_id, vnic,
tunnel_index)
self.fixed_count += 1
sub_if['name'] = network_id
if not found:
LOG.warning('Housekeeping: subinterface %s on vnic '
'%s on edge %s should be connected to '
'network %s but is missing', tunnel_index,
vnic['index'], edge_id, network_id)
self.error_count += 1
self.error_info = base_job.housekeeper_warning(
self.error_info,
'subinterface %s on vnic %s on edge %s should be '
'connected to network %s but is missing',
tunnel_index, vnic['index'], edge_id, network_id)
if_changed[vnic['index']] = True
if not self.readonly:
self._recreate_vnic_subinterface(
context, network_id, edge_id, vnic,
tunnel_index)
self.fixed_sub_if_count += 1
def _recreate_vnic_subinterface(
self, context, network_id, edge_id, vnic, tunnel_index):

View File

@ -49,6 +49,9 @@ class LbaasPendingJob(base_job.BaseJob):
def run(self, context):
super(LbaasPendingJob, self).run(context)
curr_time = time.time()
error_count = 0
fixed_count = 0
error_info = ''
for model in self.lbaas_models:
sess = context.session
@ -65,11 +68,15 @@ class LbaasPendingJob(base_job.BaseJob):
if lifetime > ELEMENT_LIFETIME:
# Entry has been pending for more than lifetime.
# Report and remove when in R/W mode
LOG.warning('Housekeeping: LBaaS %s %s is stuck in '
'pending state',
model.NAME, element['id'])
error_count += 1
error_info = base_job.housekeeper_warning(
error_info,
'LBaaS %s %s is stuck in pending state',
model.NAME, element['id'])
if not self.readonly:
element['provisioning_status'] = constants.ERROR
fixed_count += 1
del self.lbaas_objects[element['id']]
else:
# Entry is still pending but haven't reached lifetime
@ -93,3 +100,9 @@ class LbaasPendingJob(base_job.BaseJob):
LOG.debug('Housekeeping: LBaaS %s %s is back to normal',
self.lbaas_objects[obj_id]['model'].NAME, obj_id)
del self.lbaas_objects[obj_id]
if error_count == 0:
error_info = 'No LBaaS objects in pending state'
return {'error_count': error_count,
'fixed_count': fixed_count,
'error_info': error_info}

View File

@ -17,6 +17,7 @@ import mock
from neutron.tests import base
from neutron_lib.plugins import constants
from vmware_nsx.plugins.common.housekeeper import base_job
from vmware_nsx.plugins.nsx_v.housekeeper import error_backup_edge
FAKE_ROUTER_BINDINGS = [
@ -42,7 +43,7 @@ class ErrorBackupEdgeTestCaseReadOnly(base.BaseTestCase):
mock.patch('neutron_lib.plugins.directory.get_plugin',
side_effect=get_plugin_mock).start()
self.log = mock.Mock()
error_backup_edge.LOG = self.log
base_job.LOG = self.log
self.job = error_backup_edge.ErrorBackupEdgeJob(self._is_readonly())
def test_clean_run(self):

View File

@ -20,6 +20,7 @@ import mock
from neutron.tests import base
from neutron_lib.plugins import constants
from vmware_nsx.plugins.common.housekeeper import base_job
from vmware_nsx.plugins.nsx_v.housekeeper import error_dhcp_edge
FAKE_ROUTER_BINDINGS = [
@ -289,7 +290,7 @@ class ErrorDhcpEdgeTestCaseReadOnly(base.BaseTestCase):
mock.patch.object(self.plugin, 'get_availability_zone_name_by_edge',
return_value='default').start()
self.log = mock.Mock()
error_dhcp_edge.LOG = self.log
base_job.LOG = self.log
self.job = error_dhcp_edge.ErrorDhcpEdgeJob(self._is_readonly())
def test_clean_run(self):