NSXv: admin util metadata breakage recovery

Due to a Neutron bug, some metadata components may be missing from the
backend Edge appliances. This patch adds an admin utility command that
detects and restores them.

Admin util command can run per Edge, per AZ or for the whole cloud.

Cases handled by the utility:
- The internal IPs of the existing metadata proxies differ from the IPs
defined in the Edge's loadbalancer object.
This can happen when the metadata proxies were recreated for some reason.

- The Edge appliance lacks both the metadata network connectivity and the
loadbalancer objects.
This can happen when a router or DHCP edge was created by the Neutron
parent process, which failed to initialize metadata due to a bug.

- The Edge is missing the metadata firewall rules.
This can happen when the first interface attachment to the router was
done in the Neutron parent process context, due to the bug described above.

Command syntax:
Update AZ:
    nsxadmin -r metadata -o nsx-update --property az-name=az123

Update single Edge appliance:
    nsxadmin -r metadata -o nsx-update --property edge-id=edge-15

Update entire cloud:
    nsxadmin -r metadata -o nsx-update

Change-Id: I77de9e0a0c627e43d3b1c95573d151e0414a34a9
This commit is contained in:
Kobi Samoray 2019-03-12 11:53:12 +02:00
parent 586f4d0f1d
commit 0e97278c8a
2 changed files with 243 additions and 61 deletions

View File

@ -287,10 +287,18 @@ Security Groups, Firewall and Spoofguard
Metadata
~~~~~~~~
- Update loadbalancer members on router and DHCP edges::
- Update metadata infrastructure on all router and DHCP edges::
nsxadmin -r metadata -o nsx-update
- Update metadata infrastructure on availability zone's router and DHCP edges::
nsxadmin -r metadata -o nsx-update --property az-name=az123
- Update metadata infrastructure on specific router or DHCP edge::
nsxadmin -r metadata -o nsx-update --property edge-id=edge-15
- Update shared secret on router and DHCP edges::
nsxadmin -r metadata -o nsx-update-secret

View File

@ -29,6 +29,7 @@ from vmware_nsx.plugins.nsx_v import availability_zones as nsx_az
from vmware_nsx.plugins.nsx_v import md_proxy
from vmware_nsx.plugins.nsx_v.vshield.common import constants as vcns_constants
from vmware_nsx.plugins.nsx_v.vshield import nsxv_loadbalancer as nsxv_lb
from vmware_nsx.services.lbaas.nsx_v import lbaas_common as lb_common
from vmware_nsx.shell.admin.plugins.common import constants
from vmware_nsx.shell.admin.plugins.common import formatters
from vmware_nsx.shell.admin.plugins.common import utils as admin_utils
@ -36,100 +37,273 @@ from vmware_nsx.shell.admin.plugins.nsxv.resources import utils as utils
from vmware_nsx.shell import resources as shell
# Firewall rule templates that are added to an edge when neither rule name is
# found in its firewall configuration. 'ruleTag' is a placeholder here; it is
# filled in when the rules are appended to an edge's rule list.
NSXV_MD_RULES = [
# Accept TCP traffic addressed to 169.254.169.254 on ports 80, 443 and 8775.
{'name': 'MDServiceIP',
'destination': {'ipAddress': ['169.254.169.254']},
'enabled': True,
'application': {'service': [{'protocol': 'tcp',
'port': [80, 443, 8775]}]},
'action': 'accept',
'ruleTag': None},
# Deny any other traffic addressed to the 169.254.128.0/17 range.
{'name': 'MDInterEdgeNet',
'destination': {'ipAddress': ['169.254.128.0/17']},
'enabled': True,
'action': 'deny',
'ruleTag': None}]
# Module-level logger used by every helper below.
LOG = logging.getLogger(__name__)
# Backend client shared by the helpers below; created once at import time.
nsxv = utils.get_nsxv_client()
def _append_md_fw_rules(fw_rules):
    """Append the metadata firewall rules to an edge's rule list.

    The appended rules receive consecutive ``ruleTag`` values that continue
    after the number of rules already present in ``fw_rules``.

    :param fw_rules: list of firewall rule dicts; it is extended in place.
    :returns: the same ``fw_rules`` list, now ending with the metadata rules.
    """
    # Tag copies of the templates rather than the templates themselves, so the
    # module-level NSXV_MD_RULES entries are never mutated and no dict object
    # ends up shared between the rule lists of different edges.
    # The copies are shallow: nested values are only read here, never changed.
    first_tag = len(fw_rules) + 1
    fw_rules.extend(
        dict(rule, ruleTag=first_tag + offset)
        for offset, rule in enumerate(NSXV_MD_RULES))
    return fw_rules
def _handle_edge_firewall_rules(edge_id):
    """Add the metadata firewall rules to an edge when they are missing.

    The edge's firewall configuration is read from the backend. If no rule
    named 'MDInterEdgeNet' or 'MDServiceIP' is present, the metadata rules are
    appended and the configuration is written back. Backend failures are
    logged and never propagated to the caller.

    :param edge_id: identifier of the edge whose firewall is inspected.
    """
    try:
        _, fw_cfg = nsxv.get_firewall(edge_id)
    except Exception as e:
        LOG.error("Failed to retrieve firewall config for edge %(edge)s "
                  "with exception %(e)s", {'edge': edge_id, 'e': e})
        # Without the current configuration there is nothing safe to update:
        # continuing with an empty config would fail on the missing
        # 'firewallRules' section instead of reporting the real problem.
        return

    # Create the rules section when it is absent so the write-back below
    # cannot fail on a missing key.
    fw_section = fw_cfg.setdefault('firewallRules', {})
    fw_rules = fw_section.get('firewallRules', [])

    md_rule_names = ('MDInterEdgeNet', 'MDServiceIP')
    if any(rule.get('name') in md_rule_names for rule in fw_rules):
        # At least one metadata rule already exists; leave the edge untouched.
        return

    fw_section['firewallRules'] = _append_md_fw_rules(fw_rules)
    try:
        nsxv.update_firewall(edge_id, fw_cfg)
        LOG.info('Added missing firewall rules for edge %s', edge_id)
    except Exception as e:
        LOG.warning("Failed to update firewall config for edge "
                    "%(edge)s with exception %(e)s",
                    {'edge': edge_id, 'e': e})
def _recreate_rtr_metadata_cfg(context, plugin, az_name, edge_id):
    """Re-run the metadata configuration of the router bound to an edge.

    :param context: object whose ``session`` is used for the binding lookup.
    :param plugin: plugin object exposing ``metadata_proxy_handler``, a
        mapping from availability zone name to its metadata handler.
    :param az_name: name of the availability zone the edge belongs to.
    :param edge_id: identifier of the edge to repair.
    """
    rtr_binding = nsxv_db.get_nsxv_router_binding_by_edge(
        context.session, edge_id)
    if not rtr_binding:
        # Nothing to configure without the router that owns this edge.
        LOG.error('No edge binding found for edge %s', edge_id)
        return

    # .get() keeps an availability zone without a handler on the "nothing to
    # do" path instead of raising KeyError before the check below.
    md_handler = plugin.metadata_proxy_handler.get(az_name)
    if not md_handler:
        return

    try:
        md_handler.configure_router_edge(
            context, rtr_binding['router_id'])
        LOG.info('Added metadata components for edge %s',
                 edge_id)
    except Exception as e:
        LOG.error('Recreation of metadata components for edge '
                  '%(edge)s failed with error %(e)s',
                  {'edge': edge_id, 'e': e})
def _update_md_lb_members(edge_id, edge_internal_ips, lb, pool):
    """Replace the members of a metadata LB pool and push the LB to the edge.

    The pool is emptied and repopulated with one member per address in
    ``edge_internal_ips``, named 'Member-1', 'Member-2', ... in iteration
    order. A failure to submit the loadbalancer is logged, not raised.

    :param edge_id: identifier of the edge owning the loadbalancer.
    :param edge_internal_ips: iterable of member IP addresses.
    :param lb: loadbalancer object that is submitted to the backend.
    :param pool: pool object whose members are rebuilt.
    """
    LOG.info('Updating metadata members for edge %s', edge_id)

    pool.members = {}
    # The same configured port serves as both service and monitor port.
    metadata_port = cfg.CONF.nsxv.nova_metadata_port
    for index, proxy_ip in enumerate(edge_internal_ips, start=1):
        pool.add_member(
            nsxv_lb.NsxvLBPoolMember(
                name='Member-%d' % index,
                ip_address=proxy_ip,
                port=metadata_port,
                monitor_port=metadata_port))

    try:
        lb.submit_to_backend(nsxv, edge_id)
    except Exception as e:
        LOG.error('Updating members for %(edge)s failed with '
                  'error %(e)s', {'edge': edge_id, 'e': e})
    else:
        LOG.info('Updated members for %s', edge_id)
def _get_internal_edge_ips(context, az_name):
    """Collect the inter-edge network IPs of an AZ's internal edges.

    :param context: object whose ``session`` is used for all DB queries.
    :param az_name: name of the availability zone to inspect.
    :returns: ``(edge_internal_ips, md_rtr_ids)`` - the list of IP addresses
        found and the router ids of the AZ's internal edges - or
        ``(None, None)`` when the internal network, its subnet, or every
        internal IP is missing.
    """
    session = context.session
    purpose = vcns_constants.InternalEdgePurposes.INTER_EDGE_PURPOSE

    # Look up the internal network registered for this AZ.
    db_net = nsxv_db.get_nsxv_internal_network_for_az(
        session, purpose, az_name)
    internal_net = db_net['network_id'] if db_net else None
    if not internal_net:
        return None, None

    # A network without a subnet is treated as missing infrastructure rather
    # than crashing on the absent row.
    internal_subnet = session.query(models_v2.Subnet).filter_by(
        network_id=internal_net).first()
    if internal_subnet is None:
        return None, None

    # Keep only the internal edges whose router belongs to this AZ.
    edge_list = nsxv_db.get_nsxv_internal_edges_by_purpose(session, purpose)
    edge_az_list = [edge for edge in edge_list if
                    nsxv_db.get_router_availability_zone(
                        session, edge['router_id']) == az_name]
    md_rtr_ids = [edge['router_id'] for edge in edge_az_list]

    edge_internal_ips = []
    for edge in edge_az_list:
        edge_internal_port = session.query(models_v2.Port).filter_by(
            network_id=internal_net,
            device_id=edge['router_id']).first()
        if not edge_internal_port:
            continue
        edge_internal_ip = session.query(models_v2.IPAllocation).filter_by(
            port_id=edge_internal_port['id']).first()
        # A port may exist without an IP allocation; skip it instead of
        # failing on a None row.
        if edge_internal_ip:
            edge_internal_ips.append(edge_internal_ip['ip_address'])

    if not edge_internal_ips:
        return None, None

    LOG.info('Metadata proxy internal IPs are %s', edge_internal_ips)
    return edge_internal_ips, md_rtr_ids
def _handle_edge(context, plugin, az_name, edge_id, edge_internal_ips):
    """Repair the metadata loadbalancer and firewall rules of one edge.

    :param context: passed through to the metadata re-creation helper.
    :param plugin: passed through to the metadata re-creation helper.
    :param az_name: name of the availability zone the edge belongs to.
    :param edge_id: identifier of the edge to inspect; also used as lock name.
    :param edge_internal_ips: IPs the metadata LB pool members should have.
    """
    # Hold the per-edge lock for the whole inspection and repair.
    with locking.LockManager.get_lock(edge_id):
        lb = nsxv_lb.NsxvLoadbalancer.get_loadbalancer(nsxv, edge_id)
        virt = lb.virtual_servers.get(md_proxy.METADATA_VSE_NAME)
        if virt:
            pool = virt.default_pool
            curr_member_ips = [member.payload['ipAddress'] for member in
                               pool.members.values()]
            # Compare as sets: only the membership matters, not the order.
            if set(curr_member_ips) != set(edge_internal_ips):
                _update_md_lb_members(edge_id, edge_internal_ips, lb, pool)
        else:
            # Interface connectivity and LB definition are done at the same
            # operation. if LB is missing then interface should be missing
            # as well
            LOG.info('Metadata LB components for edge %s are missing',
                     edge_id)
            _recreate_rtr_metadata_cfg(context, plugin, az_name, edge_id)
        # NOTE(review): the firewall check is assumed to run for every edge
        # (inside the lock, after either branch) - confirm against the
        # original indentation.
        _handle_edge_firewall_rules(edge_id)
@admin_utils.output_header
def nsx_redo_metadata_cfg(resource, event, trigger, **kwargs):
    """Entry point: restore metadata configuration per edge, AZ or cloud.

    The ``property`` keyword argument is parsed into key/value pairs:
    - ``edge-id`` selects a single edge;
    - ``az-name`` selects a whole availability zone;
    - no properties at all selects every availability zone.
    Properties containing neither key are reported as an error.
    """
    properties = admin_utils.parse_multi_keyval_opt(kwargs.get('property'))
    edgeapi = utils.NeutronDbClient()
    plugin = utils.NsxVPluginWrapper()
    edge_id = properties.get('edge-id')

    if not properties:
        # No scope given: process the whole cloud.
        nsx_redo_metadata_cfg_all(edgeapi.context, plugin)
    elif edge_id:
        nsx_redo_metadata_cfg_for_edge(edgeapi.context, plugin, edge_id)
    elif properties.get('az-name'):
        nsx_redo_metadata_cfg_for_az(
            edgeapi.context, plugin, properties.get('az-name'))
    else:
        LOG.error('Cannot parse properties %s', properties)
def nsx_redo_metadata_cfg_for_edge(context, plugin, edge_id):
    """Restore the metadata configuration of a single edge.

    The edge is skipped, with an error logged, when it has no router binding,
    when its availability zone is unknown or does not support metadata, when
    the AZ's metadata infrastructure cannot be found, when the edge is itself
    a metadata proxy, or when its router id carries a backup, PLR or LBaaS
    prefix.

    :param context: object whose ``session`` is used for DB lookups.
    :param plugin: plugin object handed to the per-edge repair helper.
    :param edge_id: identifier of the edge to repair.
    """
    binding = nsxv_db.get_nsxv_router_binding_by_edge(context.session, edge_id)
    if not binding:
        LOG.error('No edge binding found for edge %s', edge_id)
        return

    az_name = binding['availability_zone']
    az = nsx_az.NsxVAvailabilityZones().availability_zones.get(az_name)
    if not az:
        LOG.error('Availability zone %s not found', az_name)
        return
    if not az.supports_metadata():
        LOG.error('Edge %(edge)s belongs to az %(az)s which does not '
                  'support metadata',
                  {'az': az_name, 'edge': edge_id})
        # Nothing can be restored on an AZ without metadata support.
        return

    edge_internal_ips, md_rtr_ids = _get_internal_edge_ips(context,
                                                           az_name)
    if not edge_internal_ips or not md_rtr_ids:
        # The lookup yields (None, None) when the infrastructure is absent;
        # stop here instead of failing on the membership test below.
        LOG.error("Metadata infrastructure is missing or broken. "
                  "It is recommended to restart neutron service before "
                  "proceeding with configuration restoration")
        return

    router_id = binding['router_id']
    if router_id in md_rtr_ids:
        LOG.error('Edge %s is a metadata proxy', edge_id)
        return

    # Router ids with these prefixes do not deliver metadata to workloads.
    non_md_prefixes = (vcns_constants.BACKUP_ROUTER_PREFIX,
                       vcns_constants.PLR_EDGE_PREFIX,
                       lb_common.RESOURCE_ID_PFX)
    if router_id.startswith(non_md_prefixes):
        LOG.error('Edge %s is not a metadata delivery appliance', edge_id)
        return

    _handle_edge(context, plugin, az_name, edge_id, edge_internal_ips)
@admin_utils.output_header
def nsx_redo_metadata_cfg_all(context, plugin):
    """Restore the metadata configuration on every availability zone.

    Asks the operator for confirmation first. Availability zones that do not
    support metadata are skipped with an informational message.

    :param context: object handed to the per-AZ routine for DB access.
    :param plugin: plugin object handed to the per-AZ routine.
    """
    user_confirm = admin_utils.query_yes_no("Do you want to setup metadata "
                                            "infrastructure for all the edges",
                                            default="no")
    if not user_confirm:
        LOG.info("Metadata infrastructure update aborted by user")
        return

    config.register_nsxv_azs(cfg.CONF, cfg.CONF.nsxv.availability_zones)
    conf_az = nsx_az.NsxVAvailabilityZones()
    for az in conf_az.list_availability_zones_objects():
        if az.supports_metadata():
            # Metadata support was just verified, so the per-AZ routine does
            # not need to look the zone up again.
            nsx_redo_metadata_cfg_for_az(context, plugin, az.name,
                                         check_az=False)
        else:
            LOG.info("Skipping availability zone: %s - no metadata "
                     "configuration", az.name)
def nsx_redo_metadata_cfg_for_az(context, plugin, az_name, check_az=True):
    """Restore the metadata configuration on all eligible edges of an AZ.

    Eligible edges are the service edges of the availability zone whose
    router is not a metadata proxy and whose router id does not carry a
    backup, PLR or LBaaS prefix.

    :param context: object whose ``session`` is used for DB lookups.
    :param plugin: plugin object handed to the per-edge repair helper.
    :param az_name: name of the availability zone to process.
    :param check_az: when true, verify that the zone exists and supports
        metadata before doing anything else.
    """
    LOG.info("Updating MetaData for availability zone: %s", az_name)

    if check_az:
        conf_az = nsx_az.NsxVAvailabilityZones()
        az = conf_az.availability_zones.get(az_name)
        if not az:
            LOG.error('Availability zone %s not found', az_name)
            return
        if not az.supports_metadata():
            LOG.error('Availability zone %s is not configured with metadata',
                      az_name)
            return

    edge_internal_ips, md_rtr_ids = _get_internal_edge_ips(context,
                                                           az_name)
    if not edge_internal_ips or not md_rtr_ids:
        LOG.error("Metadata infrastructure is missing or broken. "
                  "It is recommended to restart neutron service before "
                  "proceeding with configuration restoration")
        return

    router_bindings = nsxv_db.get_nsxv_router_bindings(
        context.session,
        filters={'edge_type': [nsxv_constants.SERVICE_EDGE],
                 'availability_zone': [az_name]})

    # Build the lookup set and prefix tuple once, outside the filtering loop.
    md_routers = set(md_rtr_ids)
    non_md_prefixes = (vcns_constants.BACKUP_ROUTER_PREFIX,
                       vcns_constants.PLR_EDGE_PREFIX,
                       lb_common.RESOURCE_ID_PFX)
    # Several bindings may point at the same edge; a set handles each once.
    edge_ids = {binding['edge_id'] for binding in router_bindings
                if binding['router_id'] not in md_routers and
                not binding['router_id'].startswith(non_md_prefixes)}

    for edge_id in edge_ids:
        _handle_edge(context, plugin, az_name, edge_id, edge_internal_ips)
@admin_utils.output_header