Merge "L3 Agent restart causes network outage"

This commit is contained in:
Jenkins 2014-02-22 03:09:22 +00:00 committed by Gerrit Code Review
commit f52f70334a
2 changed files with 112 additions and 69 deletions

View File

@ -217,8 +217,9 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
self.updated_routers = set()
self.removed_routers = set()
self.sync_progress = False
if self.conf.use_namespaces:
self._destroy_router_namespaces(self.conf.router_id)
self._delete_stale_namespaces = (self.conf.use_namespaces and
self.conf.router_delete_namespaces)
self.rpc_loop = loopingcall.FixedIntervalLoopingCall(
self._rpc_loop)
@ -241,28 +242,46 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
LOG.error(msg)
raise SystemExit(msg)
def _destroy_router_namespaces(self, only_router_id=None):
"""Destroy router namespaces on the host to eliminate all stale
linux devices, iptables rules, and namespaces.
def _cleanup_namespaces(self, routers):
"""Destroy stale router namespaces on host when L3 agent restarts
If only_router_id is passed, only destroy single namespace, to allow
for multiple l3 agents on the same host, without stepping on each
other's toes on init. This only makes sense if only_router_id is set.
This routine is called when self._delete_stale_namespaces is True.
The argument routers is the list of routers that are recorded in
the database as being hosted on this node.
"""
try:
root_ip = ip_lib.IPWrapper(self.root_helper)
for ns in root_ip.get_namespaces(self.root_helper):
if ns.startswith(NS_PREFIX):
router_id = ns[len(NS_PREFIX):]
if only_router_id and not only_router_id == router_id:
continue
host_namespaces = root_ip.get_namespaces(self.root_helper)
router_namespaces = set(ns for ns in host_namespaces
if ns.startswith(NS_PREFIX))
ns_to_ignore = set(NS_PREFIX + r['id'] for r in routers)
ns_to_destroy = router_namespaces - ns_to_ignore
except RuntimeError:
LOG.exception(_('RuntimeError in obtaining router list '
'for namespace cleanup.'))
else:
self._destroy_stale_router_namespaces(ns_to_destroy)
def _destroy_stale_router_namespaces(self, router_namespaces):
"""Destroys the stale router namespaces
The argumenet router_namespaces is a list of stale router namespaces
As some stale router namespaces may not be able to be deleted, only
one attempt will be made to delete them.
"""
for ns in router_namespaces:
if self.conf.enable_metadata_proxy:
self._destroy_metadata_proxy(router_id, ns)
self._destroy_metadata_proxy(ns[len(NS_PREFIX):], ns)
try:
self._destroy_router_namespace(ns)
except Exception:
LOG.exception(_("Failed deleting namespace '%s'"), ns)
except RuntimeError:
LOG.exception(_('Failed to destroy stale router namespace '
'%s'), ns)
self._delete_stale_namespaces = False
def _destroy_router_namespace(self, namespace):
ns_ip = ip_lib.IPWrapper(self.root_helper, namespace=namespace)
@ -769,10 +788,19 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
self._process_routers(routers, all_routers=True)
self.fullsync = False
LOG.debug(_("_sync_routers_task successfully completed"))
except rpc_common.RPCException:
LOG.exception(_("Failed synchronizing routers due to RPC error"))
self.fullsync = True
return
except Exception:
LOG.exception(_("Failed synchronizing routers"))
self.fullsync = True
# Resync is not necessary for the cleanup of stale
# namespaces.
if self._delete_stale_namespaces:
self._cleanup_namespaces(routers)
def after_start(self):
LOG.info(_("L3 agent started"))

View File

@ -780,55 +780,6 @@ class TestBasicRouterOperations(base.BaseTestCase):
agent._process_router_delete()
self.assertFalse(list(agent.removed_routers))
def test_destroy_namespace(self):
class FakeDev(object):
def __init__(self, name):
self.name = name
self.mock_ip.get_namespaces.return_value = ['qrouter-foo',
'qrouter-bar']
self.mock_ip.get_devices.return_value = [FakeDev('qr-aaaa'),
FakeDev('qgw-aaaa')]
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
pm = self.external_process.return_value
pm.reset_mock()
agent._destroy_router_namespace = mock.MagicMock()
agent._destroy_router_namespaces()
self.assertEqual(pm.disable.call_count, 2)
self.assertEqual(agent._destroy_router_namespace.call_count, 2)
def test_destroy_namespace_with_router_id(self):
class FakeDev(object):
def __init__(self, name):
self.name = name
self.conf.router_id = _uuid()
namespaces = ['qrouter-foo', 'qrouter-' + self.conf.router_id]
self.mock_ip.get_namespaces.return_value = namespaces
self.mock_ip.get_devices.return_value = [FakeDev('qr-aaaa'),
FakeDev('qgw-aaaa')]
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
pm = self.external_process.return_value
pm.reset_mock()
agent._destroy_router_namespace = mock.MagicMock()
agent._destroy_router_namespaces(self.conf.router_id)
self.assertEqual(pm.disable.call_count, 1)
self.assertEqual(agent._destroy_router_namespace.call_count, 1)
def test_destroy_router_namespace_skips_ns_removal(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
agent._destroy_router_namespace("fakens")
@ -890,6 +841,7 @@ class TestBasicRouterOperations(base.BaseTestCase):
self.conf.set_override('router_id', '1234')
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
self.assertEqual(['1234'], agent._router_ids())
self.assertFalse(agent._delete_stale_namespaces)
def test_process_routers_with_no_ext_net_in_conf(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
@ -988,6 +940,69 @@ class TestBasicRouterOperations(base.BaseTestCase):
'-p tcp -m tcp --dport 8775 -j ACCEPT')
self.assertEqual([rules], agent.metadata_filter_rules())
def _cleanup_namespace_test(self,
stale_namespace_list,
router_list,
other_namespaces):
self.conf.set_override('router_delete_namespaces', True)
good_namespace_list = [l3_agent.NS_PREFIX + r['id']
for r in router_list]
self.mock_ip.get_namespaces.return_value = (stale_namespace_list +
good_namespace_list +
other_namespaces)
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
self.assertTrue(agent._delete_stale_namespaces)
pm = self.external_process.return_value
pm.reset_mock()
agent._destroy_router_namespace = mock.MagicMock()
agent._cleanup_namespaces(router_list)
self.assertEqual(pm.disable.call_count, len(stale_namespace_list))
self.assertEqual(agent._destroy_router_namespace.call_count,
len(stale_namespace_list))
expected_args = [mock.call(ns) for ns in stale_namespace_list]
agent._destroy_router_namespace.assert_has_calls(expected_args,
any_order=True)
self.assertFalse(agent._delete_stale_namespaces)
def test_cleanup_namespace(self):
self.conf.set_override('router_id', None)
stale_namespaces = [l3_agent.NS_PREFIX + 'foo',
l3_agent.NS_PREFIX + 'bar']
other_namespaces = ['unknown']
self._cleanup_namespace_test(stale_namespaces,
[],
other_namespaces)
def test_cleanup_namespace_with_registered_router_ids(self):
self.conf.set_override('router_id', None)
stale_namespaces = [l3_agent.NS_PREFIX + 'cccc',
l3_agent.NS_PREFIX + 'eeeee']
router_list = [{'id': 'foo'}, {'id': 'aaaa'}]
other_namespaces = ['qdhcp-aabbcc', 'unknown']
self._cleanup_namespace_test(stale_namespaces,
router_list,
other_namespaces)
def test_cleanup_namespace_with_conf_router_id(self):
self.conf.set_override('router_id', 'bbbbb')
stale_namespaces = [l3_agent.NS_PREFIX + 'cccc',
l3_agent.NS_PREFIX + 'eeeee',
l3_agent.NS_PREFIX + self.conf.router_id]
router_list = [{'id': 'foo'}, {'id': 'aaaa'}]
other_namespaces = ['qdhcp-aabbcc', 'unknown']
self._cleanup_namespace_test(stale_namespaces,
router_list,
other_namespaces)
class TestL3AgentEventHandler(base.BaseTestCase):