Improve l3-agent performance and prevent races in it.
Fixes bug 1194026 Fixes bug 1200749 Introduce a looping call for performing synchronization with neutron server. The sync will be performed only if router changes are notified via rpc. Only affected routers will be synchronized. Changes will be implemented by the l3 agent spawning a distinct greenthread for each router - iptables will be executed only once using iptables_manager.defer_apply_on. This patch will prevent the occurence of the following issues: - Out-of-order rpc message processing - Long processing time for router changes due to serial execution - Occasional and expected RPC blocks for long periods - Unnecessary processing of multiple requests Change-Id: I0978d1c38ac5c38c4548e5b1877857bb5cac3b81
This commit is contained in:
parent
2890f4c956
commit
39b125fbe9
@ -18,7 +18,6 @@
|
||||
#
|
||||
|
||||
import eventlet
|
||||
from eventlet import semaphore
|
||||
import netaddr
|
||||
from oslo.config import cfg
|
||||
|
||||
@ -36,6 +35,7 @@ from neutron.common import utils as common_utils
|
||||
from neutron import context
|
||||
from neutron import manager
|
||||
from neutron.openstack.common import importutils
|
||||
from neutron.openstack.common import lockutils
|
||||
from neutron.openstack.common import log as logging
|
||||
from neutron.openstack.common import loopingcall
|
||||
from neutron.openstack.common import periodic_task
|
||||
@ -49,6 +49,7 @@ LOG = logging.getLogger(__name__)
|
||||
NS_PREFIX = 'qrouter-'
|
||||
INTERNAL_DEV_PREFIX = 'qr-'
|
||||
EXTERNAL_DEV_PREFIX = 'qg-'
|
||||
RPC_LOOP_INTERVAL = 1
|
||||
|
||||
|
||||
class L3PluginApi(proxy.RpcProxy):
|
||||
@ -66,9 +67,8 @@ class L3PluginApi(proxy.RpcProxy):
|
||||
topic=topic, default_version=self.BASE_RPC_API_VERSION)
|
||||
self.host = host
|
||||
|
||||
def get_routers(self, context, fullsync=True, router_id=None):
|
||||
def get_routers(self, context, fullsync=True, router_ids=None):
|
||||
"""Make a remote process call to retrieve the sync data for routers."""
|
||||
router_ids = [router_id] if router_id else None
|
||||
return self.call(context,
|
||||
self.make_msg('sync_routers', host=self.host,
|
||||
fullsync=fullsync,
|
||||
@ -140,6 +140,17 @@ class RouterInfo(object):
|
||||
|
||||
|
||||
class L3NATAgent(manager.Manager):
|
||||
"""Manager for L3NatAgent
|
||||
|
||||
API version history:
|
||||
1.0 initial Version
|
||||
1.1 changed the type of the routers parameter
|
||||
to the routers_updated method.
|
||||
It was previously a list of routers in dict format.
|
||||
It is now a list of router IDs only.
|
||||
Per rpc versioning rules, it is backwards compatible.
|
||||
"""
|
||||
RPC_API_VERSION = '1.1'
|
||||
|
||||
OPTS = [
|
||||
cfg.StrOpt('external_network_bridge', default='br-ex',
|
||||
@ -196,9 +207,15 @@ class L3NATAgent(manager.Manager):
|
||||
self.context = context.get_admin_context_without_session()
|
||||
self.plugin_rpc = L3PluginApi(topics.PLUGIN, host)
|
||||
self.fullsync = True
|
||||
self.sync_sem = semaphore.Semaphore(1)
|
||||
self.updated_routers = set()
|
||||
self.removed_routers = set()
|
||||
self.sync_progress = False
|
||||
if self.conf.use_namespaces:
|
||||
self._destroy_router_namespaces(self.conf.router_id)
|
||||
|
||||
self.rpc_loop = loopingcall.FixedIntervalLoopingCall(
|
||||
self._rpc_loop)
|
||||
self.rpc_loop.start(interval=RPC_LOOP_INTERVAL)
|
||||
super(L3NATAgent, self).__init__(host=self.conf.host)
|
||||
|
||||
def _destroy_router_namespaces(self, only_router_id=None):
|
||||
@ -323,6 +340,7 @@ class L3NATAgent(manager.Manager):
|
||||
port['ip_cidr'] = "%s/%s" % (ips[0]['ip_address'], prefixlen)
|
||||
|
||||
def process_router(self, ri):
|
||||
ri.iptables_manager.defer_apply_on()
|
||||
ex_gw_port = self._get_ex_gw_port(ri)
|
||||
internal_ports = ri.router.get(l3_constants.INTERFACE_KEY, [])
|
||||
existing_port_ids = set([p['id'] for p in ri.internal_ports])
|
||||
@ -371,6 +389,7 @@ class L3NATAgent(manager.Manager):
|
||||
ri.ex_gw_port = ex_gw_port
|
||||
ri.enable_snat = ri.router.get('enable_snat')
|
||||
self.routes_updated(ri)
|
||||
ri.iptables_manager.defer_apply_off()
|
||||
|
||||
def _handle_router_snat_rules(self, ri, ex_gw_port, internal_cidrs,
|
||||
interface_name, action):
|
||||
@ -586,35 +605,28 @@ class L3NATAgent(manager.Manager):
|
||||
|
||||
def router_deleted(self, context, router_id):
|
||||
"""Deal with router deletion RPC message."""
|
||||
with self.sync_sem:
|
||||
if router_id in self.router_info:
|
||||
try:
|
||||
self._router_removed(router_id)
|
||||
except Exception:
|
||||
msg = _("Failed dealing with router "
|
||||
"'%s' deletion RPC message")
|
||||
LOG.debug(msg, router_id)
|
||||
self.fullsync = True
|
||||
LOG.debug(_('Got router deleted notification for %s'), router_id)
|
||||
self.removed_routers.add(router_id)
|
||||
|
||||
def routers_updated(self, context, routers):
|
||||
"""Deal with routers modification and creation RPC message."""
|
||||
if not routers:
|
||||
return
|
||||
with self.sync_sem:
|
||||
try:
|
||||
self._process_routers(routers)
|
||||
except Exception:
|
||||
msg = _("Failed dealing with routers update RPC message")
|
||||
LOG.debug(msg)
|
||||
self.fullsync = True
|
||||
LOG.debug(_('Got routers updated notification :%s'), routers)
|
||||
if routers:
|
||||
# This is needed for backward compatiblity
|
||||
if isinstance(routers[0], dict):
|
||||
routers = [router['id'] for router in routers]
|
||||
self.updated_routers.update(routers)
|
||||
|
||||
def router_removed_from_agent(self, context, payload):
|
||||
self.router_deleted(context, payload['router_id'])
|
||||
LOG.debug(_('Got router removed from agent :%r'), payload)
|
||||
self.removed_routers.add(payload['router_id'])
|
||||
|
||||
def router_added_to_agent(self, context, payload):
|
||||
LOG.debug(_('Got router added to agent :%r'), payload)
|
||||
self.routers_updated(context, payload)
|
||||
|
||||
def _process_routers(self, routers, all_routers=False):
|
||||
pool = eventlet.GreenPool()
|
||||
if (self.conf.external_network_bridge and
|
||||
not ip_lib.device_exists(self.conf.external_network_bridge)):
|
||||
LOG.error(_("The external network bridge '%s' does not exist"),
|
||||
@ -652,23 +664,53 @@ class L3NATAgent(manager.Manager):
|
||||
self._router_added(r['id'], r)
|
||||
ri = self.router_info[r['id']]
|
||||
ri.router = r
|
||||
self.process_router(ri)
|
||||
pool.spawn_n(self.process_router, ri)
|
||||
# identify and remove routers that no longer exist
|
||||
for router_id in prev_router_ids - cur_router_ids:
|
||||
self._router_removed(router_id)
|
||||
pool.spawn_n(self._router_removed, router_id)
|
||||
pool.waitall()
|
||||
|
||||
@lockutils.synchronized('l3-agent', 'neutron-')
|
||||
def _rpc_loop(self):
|
||||
# _rpc_loop and _sync_routers_task will not be
|
||||
# executed in the same time because of lock.
|
||||
# so we can clear the value of updated_routers
|
||||
# and removed_routers
|
||||
try:
|
||||
if self.updated_routers:
|
||||
router_ids = list(self.updated_routers)
|
||||
self.updated_routers.clear()
|
||||
routers = self.plugin_rpc.get_routers(
|
||||
self.context, router_ids)
|
||||
self._process_routers(routers)
|
||||
self._process_router_delete()
|
||||
except Exception:
|
||||
LOG.exception(_("Failed synchronizing routers"))
|
||||
self.fullsync = True
|
||||
|
||||
def _process_router_delete(self):
|
||||
current_removed_routers = list(self.removed_routers)
|
||||
for router_id in current_removed_routers:
|
||||
self._router_removed(context, router_id)
|
||||
self.removed_routers.remove(router_id)
|
||||
|
||||
def _router_ids(self):
|
||||
if not self.conf.use_namespaces:
|
||||
return [self.conf.router_id]
|
||||
|
||||
@periodic_task.periodic_task
|
||||
@lockutils.synchronized('l3-agent', 'neutron-')
|
||||
def _sync_routers_task(self, context):
|
||||
# we need to sync with router deletion RPC message
|
||||
with self.sync_sem:
|
||||
if self.fullsync:
|
||||
if not self.fullsync:
|
||||
return
|
||||
try:
|
||||
if not self.conf.use_namespaces:
|
||||
router_id = self.conf.router_id
|
||||
else:
|
||||
router_id = None
|
||||
router_ids = self._router_ids()
|
||||
self.updated_routers.clear()
|
||||
self.removed_routers.clear()
|
||||
routers = self.plugin_rpc.get_routers(
|
||||
context, router_id)
|
||||
context, router_ids)
|
||||
|
||||
LOG.debug(_('Processing :%r'), routers)
|
||||
self._process_routers(routers, all_routers=True)
|
||||
self.fullsync = False
|
||||
except Exception:
|
||||
|
@ -42,14 +42,14 @@ class L3AgentNotifyAPI(proxy.RpcProxy):
|
||||
payload=payload),
|
||||
topic='%s.%s' % (topics.L3_AGENT, host))
|
||||
|
||||
def _agent_notification(self, context, method, routers,
|
||||
def _agent_notification(self, context, method, router_ids,
|
||||
operation, data):
|
||||
"""Notify changed routers to hosting l3 agents."""
|
||||
adminContext = context.is_admin and context or context.elevated()
|
||||
plugin = manager.NeutronManager.get_plugin()
|
||||
for router in routers:
|
||||
for router_id in router_ids:
|
||||
l3_agents = plugin.get_l3_agents_hosting_routers(
|
||||
adminContext, [router['id']],
|
||||
adminContext, [router_id],
|
||||
admin_state_up=True,
|
||||
active=True)
|
||||
for l3_agent in l3_agents:
|
||||
@ -60,23 +60,24 @@ class L3AgentNotifyAPI(proxy.RpcProxy):
|
||||
'method': method})
|
||||
self.cast(
|
||||
context, self.make_msg(method,
|
||||
routers=[router]),
|
||||
topic='%s.%s' % (l3_agent.topic, l3_agent.host))
|
||||
routers=[router_id]),
|
||||
topic='%s.%s' % (l3_agent.topic, l3_agent.host),
|
||||
version='1.1')
|
||||
|
||||
def _notification(self, context, method, routers, operation, data):
|
||||
def _notification(self, context, method, router_ids, operation, data):
|
||||
"""Notify all the agents that are hosting the routers."""
|
||||
plugin = manager.NeutronManager.get_plugin()
|
||||
if utils.is_extension_supported(
|
||||
plugin, constants.L3_AGENT_SCHEDULER_EXT_ALIAS):
|
||||
adminContext = (context.is_admin and
|
||||
context or context.elevated())
|
||||
plugin.schedule_routers(adminContext, routers)
|
||||
plugin.schedule_routers(adminContext, router_ids)
|
||||
self._agent_notification(
|
||||
context, method, routers, operation, data)
|
||||
context, method, router_ids, operation, data)
|
||||
else:
|
||||
self.fanout_cast(
|
||||
context, self.make_msg(method,
|
||||
routers=routers),
|
||||
routers=router_ids),
|
||||
topic=topics.L3_AGENT)
|
||||
|
||||
def _notification_fanout(self, context, method, router_id):
|
||||
@ -99,17 +100,17 @@ class L3AgentNotifyAPI(proxy.RpcProxy):
|
||||
def router_deleted(self, context, router_id):
|
||||
self._notification_fanout(context, 'router_deleted', router_id)
|
||||
|
||||
def routers_updated(self, context, routers, operation=None, data=None):
|
||||
if routers:
|
||||
self._notification(context, 'routers_updated', routers,
|
||||
def routers_updated(self, context, router_ids, operation=None, data=None):
|
||||
if router_ids:
|
||||
self._notification(context, 'routers_updated', router_ids,
|
||||
operation, data)
|
||||
|
||||
def router_removed_from_agent(self, context, router_id, host):
|
||||
self._notification_host(context, 'router_removed_from_agent',
|
||||
{'router_id': router_id}, host)
|
||||
|
||||
def router_added_to_agent(self, context, routers, host):
|
||||
def router_added_to_agent(self, context, router_ids, host):
|
||||
self._notification_host(context, 'router_added_to_agent',
|
||||
routers, host)
|
||||
router_ids, host)
|
||||
|
||||
L3AgentNotify = L3AgentNotifyAPI()
|
||||
|
@ -129,9 +129,8 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
|
||||
router_id=router_id, agent_id=id)
|
||||
|
||||
if self.l3_agent_notifier:
|
||||
routers = self.get_sync_data(context, [router_id])
|
||||
self.l3_agent_notifier.router_added_to_agent(
|
||||
context, routers, agent_db.host)
|
||||
context, [router_id], agent_db.host)
|
||||
|
||||
def remove_router_from_l3_agent(self, context, id, router_id):
|
||||
"""Remove the router from l3 agent.
|
||||
|
@ -183,9 +183,8 @@ class L3_NAT_db_mixin(l3.RouterPluginBase):
|
||||
# Ensure we actually have something to update
|
||||
if r.keys():
|
||||
router_db.update(r)
|
||||
routers = self.get_sync_data(context.elevated(),
|
||||
[router_db['id']])
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(context, routers)
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(
|
||||
context, [router_db['id']])
|
||||
return self._make_router_dict(router_db)
|
||||
|
||||
def _create_router_gw_port(self, context, router, network_id):
|
||||
@ -336,6 +335,8 @@ class L3_NAT_db_mixin(l3.RouterPluginBase):
|
||||
raise q_exc.BadRequest(resource='router', msg=msg)
|
||||
|
||||
if 'port_id' in interface_info:
|
||||
# make sure port update is committed
|
||||
with context.session.begin(subtransactions=True):
|
||||
if 'subnet_id' in interface_info:
|
||||
msg = _("Cannot specify both subnet-id and port-id")
|
||||
raise q_exc.BadRequest(resource='router', msg=msg)
|
||||
@ -381,11 +382,8 @@ class L3_NAT_db_mixin(l3.RouterPluginBase):
|
||||
'device_owner': DEVICE_OWNER_ROUTER_INTF,
|
||||
'name': ''}})
|
||||
|
||||
routers = self.get_sync_data(context.elevated(), [router_id])
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(
|
||||
context, routers, 'add_router_interface',
|
||||
{'network_id': port['network_id'],
|
||||
'subnet_id': subnet_id})
|
||||
context, [router_id], 'add_router_interface')
|
||||
info = {'id': router_id,
|
||||
'tenant_id': subnet['tenant_id'],
|
||||
'port_id': port['id'],
|
||||
@ -428,7 +426,6 @@ class L3_NAT_db_mixin(l3.RouterPluginBase):
|
||||
subnet = self._get_subnet(context, subnet_id)
|
||||
self._confirm_router_interface_not_in_use(
|
||||
context, router_id, subnet_id)
|
||||
_network_id = port_db['network_id']
|
||||
self.delete_port(context, port_db['id'], l3_port_check=False)
|
||||
elif 'subnet_id' in interface_info:
|
||||
subnet_id = interface_info['subnet_id']
|
||||
@ -448,7 +445,6 @@ class L3_NAT_db_mixin(l3.RouterPluginBase):
|
||||
for p in ports:
|
||||
if p['fixed_ips'][0]['subnet_id'] == subnet_id:
|
||||
port_id = p['id']
|
||||
_network_id = p['network_id']
|
||||
self.delete_port(context, p['id'], l3_port_check=False)
|
||||
found = True
|
||||
break
|
||||
@ -458,11 +454,8 @@ class L3_NAT_db_mixin(l3.RouterPluginBase):
|
||||
if not found:
|
||||
raise l3.RouterInterfaceNotFoundForSubnet(router_id=router_id,
|
||||
subnet_id=subnet_id)
|
||||
routers = self.get_sync_data(context.elevated(), [router_id])
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(
|
||||
context, routers, 'remove_router_interface',
|
||||
{'network_id': _network_id,
|
||||
'subnet_id': subnet_id})
|
||||
context, [router_id], 'remove_router_interface')
|
||||
info = {'id': router_id,
|
||||
'tenant_id': subnet['tenant_id'],
|
||||
'port_id': port_id,
|
||||
@ -674,8 +667,8 @@ class L3_NAT_db_mixin(l3.RouterPluginBase):
|
||||
|
||||
router_id = floatingip_db['router_id']
|
||||
if router_id:
|
||||
routers = self.get_sync_data(context.elevated(), [router_id])
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(context, routers,
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(
|
||||
context, [router_id],
|
||||
'create_floatingip')
|
||||
return self._make_floatingip_dict(floatingip_db)
|
||||
|
||||
@ -697,8 +690,7 @@ class L3_NAT_db_mixin(l3.RouterPluginBase):
|
||||
if router_id and router_id != before_router_id:
|
||||
router_ids.append(router_id)
|
||||
if router_ids:
|
||||
routers = self.get_sync_data(context.elevated(), router_ids)
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(context, routers,
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(context, router_ids,
|
||||
'update_floatingip')
|
||||
return self._make_floatingip_dict(floatingip_db)
|
||||
|
||||
@ -711,8 +703,8 @@ class L3_NAT_db_mixin(l3.RouterPluginBase):
|
||||
floatingip['floating_port_id'],
|
||||
l3_port_check=False)
|
||||
if router_id:
|
||||
routers = self.get_sync_data(context.elevated(), [router_id])
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(context, routers,
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(
|
||||
context, [router_id],
|
||||
'delete_floatingip')
|
||||
|
||||
def get_floatingip(self, context, id, fields=None):
|
||||
@ -782,8 +774,8 @@ class L3_NAT_db_mixin(l3.RouterPluginBase):
|
||||
raise Exception(_('Multiple floating IPs found for port %s')
|
||||
% port_id)
|
||||
if router_id:
|
||||
routers = self.get_sync_data(context.elevated(), [router_id])
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(context, routers)
|
||||
l3_rpc_agent_api.L3AgentNotify.routers_updated(
|
||||
context, [router_id])
|
||||
|
||||
def _network_is_external(self, context, net_id):
|
||||
try:
|
||||
|
@ -109,7 +109,7 @@ class ChanceScheduler(object):
|
||||
context.session.add(binding)
|
||||
return True
|
||||
|
||||
def schedule(self, plugin, context, sync_router):
|
||||
def schedule(self, plugin, context, router_id):
|
||||
"""Schedule the router to an active L3 agent if there
|
||||
is no enable L3 agent hosting it.
|
||||
"""
|
||||
@ -119,14 +119,15 @@ class ChanceScheduler(object):
|
||||
# timing problem. Non-active l3 agent can return to
|
||||
# active any time
|
||||
l3_agents = plugin.get_l3_agents_hosting_routers(
|
||||
context, [sync_router['id']], admin_state_up=True)
|
||||
context, [router_id], admin_state_up=True)
|
||||
if l3_agents:
|
||||
LOG.debug(_('Router %(router_id)s has already been hosted'
|
||||
' by L3 agent %(agent_id)s'),
|
||||
{'router_id': sync_router['id'],
|
||||
{'router_id': router_id,
|
||||
'agent_id': l3_agents[0]['id']})
|
||||
return
|
||||
|
||||
sync_router = plugin.get_router(context, router_id)
|
||||
active_l3_agents = plugin.get_l3_agents(context, active=True)
|
||||
if not active_l3_agents:
|
||||
LOG.warn(_('No active L3 agents'))
|
||||
|
@ -1070,8 +1070,7 @@ class OvsL3AgentNotifierTestCase(test_l3_plugin.L3NatTestCaseMixin,
|
||||
L3_HOSTA)
|
||||
self._add_router_to_l3_agent(hosta_id,
|
||||
router1['router']['id'])
|
||||
routers = plugin.get_sync_data(self.adminContext,
|
||||
[router1['router']['id']])
|
||||
routers = [router1['router']['id']]
|
||||
mock_l3.assert_called_with(
|
||||
mock.ANY,
|
||||
plugin.l3_agent_notifier.make_msg(
|
||||
|
@ -31,6 +31,7 @@ from neutron.tests import base
|
||||
|
||||
_uuid = uuidutils.generate_uuid
|
||||
HOSTNAME = 'myhost'
|
||||
FAKE_ID = _uuid()
|
||||
|
||||
|
||||
class TestBasicRouterOperations(base.BaseTestCase):
|
||||
@ -508,22 +509,29 @@ class TestBasicRouterOperations(base.BaseTestCase):
|
||||
agent._process_routers(routers)
|
||||
self.assertNotIn(routers[0]['id'], agent.router_info)
|
||||
|
||||
def testSingleLoopRouterRemoval(self):
|
||||
def test_router_deleted(self):
|
||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||
self.plugin_api.get_external_network_id.return_value = None
|
||||
routers = [
|
||||
{'id': _uuid(),
|
||||
'admin_state_up': True,
|
||||
'routes': [],
|
||||
'external_gateway_info': {}}]
|
||||
agent._process_routers(routers)
|
||||
agent.router_deleted(None, FAKE_ID)
|
||||
# verify that will set fullsync
|
||||
self.assertTrue(FAKE_ID in agent.removed_routers)
|
||||
|
||||
agent.router_deleted(None, routers[0]['id'])
|
||||
# verify that remove is called
|
||||
self.assertEqual(self.mock_ip.get_devices.call_count, 1)
|
||||
def test_routers_updated(self):
|
||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||
agent.routers_updated(None, [FAKE_ID])
|
||||
# verify that will set fullsync
|
||||
self.assertTrue(FAKE_ID in agent.updated_routers)
|
||||
|
||||
self.device_exists.assert_has_calls(
|
||||
[mock.call(self.conf.external_network_bridge)])
|
||||
def test_removed_from_agent(self):
|
||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||
agent.router_removed_from_agent(None, {'router_id': FAKE_ID})
|
||||
# verify that will set fullsync
|
||||
self.assertTrue(FAKE_ID in agent.removed_routers)
|
||||
|
||||
def test_added_to_agent(self):
|
||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||
agent.router_added_to_agent(None, [FAKE_ID])
|
||||
# verify that will set fullsync
|
||||
self.assertTrue(FAKE_ID in agent.updated_routers)
|
||||
|
||||
def testDestroyNamespace(self):
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user