L3 Agent restart causes network outage

When an L3 agent controlling multiple qrouter namespaces
restarts, it destroys all qrouter namespaces, even those
that are still in use.  As a result, network traffic can
be interrupted on the VMs that use the networks
associated with these namespaces.

The fix is for the L3 agent to preserve the qrouter
namespaces that the agent instance recognizes and to
destroy only those it does not know about.

Closes-Bug: #1175695

Change-Id: Idae77886bd195d773878c3d212ccfd56269216fb
This commit is contained in:
Stephen Ma 2013-05-28 18:52:27 -07:00
parent a2220fad2c
commit 5dd1798903
2 changed files with 112 additions and 69 deletions

View File

@ -216,8 +216,9 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
self.updated_routers = set() self.updated_routers = set()
self.removed_routers = set() self.removed_routers = set()
self.sync_progress = False self.sync_progress = False
if self.conf.use_namespaces:
self._destroy_router_namespaces(self.conf.router_id) self._delete_stale_namespaces = (self.conf.use_namespaces and
self.conf.router_delete_namespaces)
self.rpc_loop = loopingcall.FixedIntervalLoopingCall( self.rpc_loop = loopingcall.FixedIntervalLoopingCall(
self._rpc_loop) self._rpc_loop)
@ -240,28 +241,46 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
LOG.error(msg) LOG.error(msg)
raise SystemExit(msg) raise SystemExit(msg)
def _destroy_router_namespaces(self, only_router_id=None): def _cleanup_namespaces(self, routers):
"""Destroy router namespaces on the host to eliminate all stale """Destroy stale router namespaces on host when L3 agent restarts
linux devices, iptables rules, and namespaces.
If only_router_id is passed, only destroy single namespace, to allow This routine is called when self._delete_stale_namespaces is True.
for multiple l3 agents on the same host, without stepping on each
other's toes on init. This only makes sense if only_router_id is set. The argument routers is the list of routers that are recorded in
the database as being hosted on this node.
""" """
root_ip = ip_lib.IPWrapper(self.root_helper) try:
for ns in root_ip.get_namespaces(self.root_helper): root_ip = ip_lib.IPWrapper(self.root_helper)
if ns.startswith(NS_PREFIX):
router_id = ns[len(NS_PREFIX):]
if only_router_id and not only_router_id == router_id:
continue
if self.conf.enable_metadata_proxy: host_namespaces = root_ip.get_namespaces(self.root_helper)
self._destroy_metadata_proxy(router_id, ns) router_namespaces = set(ns for ns in host_namespaces
if ns.startswith(NS_PREFIX))
ns_to_ignore = set(NS_PREFIX + r['id'] for r in routers)
ns_to_destroy = router_namespaces - ns_to_ignore
except RuntimeError:
LOG.exception(_('RuntimeError in obtaining router list '
'for namespace cleanup.'))
else:
self._destroy_stale_router_namespaces(ns_to_destroy)
try: def _destroy_stale_router_namespaces(self, router_namespaces):
self._destroy_router_namespace(ns) """Destroys the stale router namespaces
except Exception:
LOG.exception(_("Failed deleting namespace '%s'"), ns) The argumenet router_namespaces is a list of stale router namespaces
As some stale router namespaces may not be able to be deleted, only
one attempt will be made to delete them.
"""
for ns in router_namespaces:
if self.conf.enable_metadata_proxy:
self._destroy_metadata_proxy(ns[len(NS_PREFIX):], ns)
try:
self._destroy_router_namespace(ns)
except RuntimeError:
LOG.exception(_('Failed to destroy stale router namespace '
'%s'), ns)
self._delete_stale_namespaces = False
def _destroy_router_namespace(self, namespace): def _destroy_router_namespace(self, namespace):
ns_ip = ip_lib.IPWrapper(self.root_helper, namespace=namespace) ns_ip = ip_lib.IPWrapper(self.root_helper, namespace=namespace)
@ -759,10 +778,19 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
self._process_routers(routers, all_routers=True) self._process_routers(routers, all_routers=True)
self.fullsync = False self.fullsync = False
LOG.debug(_("_sync_routers_task successfully completed")) LOG.debug(_("_sync_routers_task successfully completed"))
except rpc_common.RPCException:
LOG.exception(_("Failed synchronizing routers due to RPC error"))
self.fullsync = True
return
except Exception: except Exception:
LOG.exception(_("Failed synchronizing routers")) LOG.exception(_("Failed synchronizing routers"))
self.fullsync = True self.fullsync = True
# Resync is not necessary for the cleanup of stale
# namespaces.
if self._delete_stale_namespaces:
self._cleanup_namespaces(routers)
def after_start(self): def after_start(self):
LOG.info(_("L3 agent started")) LOG.info(_("L3 agent started"))

View File

@ -732,55 +732,6 @@ class TestBasicRouterOperations(base.BaseTestCase):
agent._process_router_delete() agent._process_router_delete()
self.assertFalse(list(agent.removed_routers)) self.assertFalse(list(agent.removed_routers))
def test_destroy_namespace(self):
class FakeDev(object):
def __init__(self, name):
self.name = name
self.mock_ip.get_namespaces.return_value = ['qrouter-foo',
'qrouter-bar']
self.mock_ip.get_devices.return_value = [FakeDev('qr-aaaa'),
FakeDev('qgw-aaaa')]
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
pm = self.external_process.return_value
pm.reset_mock()
agent._destroy_router_namespace = mock.MagicMock()
agent._destroy_router_namespaces()
self.assertEqual(pm.disable.call_count, 2)
self.assertEqual(agent._destroy_router_namespace.call_count, 2)
def test_destroy_namespace_with_router_id(self):
class FakeDev(object):
def __init__(self, name):
self.name = name
self.conf.router_id = _uuid()
namespaces = ['qrouter-foo', 'qrouter-' + self.conf.router_id]
self.mock_ip.get_namespaces.return_value = namespaces
self.mock_ip.get_devices.return_value = [FakeDev('qr-aaaa'),
FakeDev('qgw-aaaa')]
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
pm = self.external_process.return_value
pm.reset_mock()
agent._destroy_router_namespace = mock.MagicMock()
agent._destroy_router_namespaces(self.conf.router_id)
self.assertEqual(pm.disable.call_count, 1)
self.assertEqual(agent._destroy_router_namespace.call_count, 1)
def test_destroy_router_namespace_skips_ns_removal(self): def test_destroy_router_namespace_skips_ns_removal(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
agent._destroy_router_namespace("fakens") agent._destroy_router_namespace("fakens")
@ -842,6 +793,7 @@ class TestBasicRouterOperations(base.BaseTestCase):
self.conf.set_override('router_id', '1234') self.conf.set_override('router_id', '1234')
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
self.assertEqual(['1234'], agent._router_ids()) self.assertEqual(['1234'], agent._router_ids())
self.assertFalse(agent._delete_stale_namespaces)
def test_process_routers_with_no_ext_net_in_conf(self): def test_process_routers_with_no_ext_net_in_conf(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
@ -940,6 +892,69 @@ class TestBasicRouterOperations(base.BaseTestCase):
'-p tcp -m tcp --dport 8775 -j ACCEPT') '-p tcp -m tcp --dport 8775 -j ACCEPT')
self.assertEqual([rules], agent.metadata_filter_rules()) self.assertEqual([rules], agent.metadata_filter_rules())
def _cleanup_namespace_test(self,
stale_namespace_list,
router_list,
other_namespaces):
self.conf.set_override('router_delete_namespaces', True)
good_namespace_list = [l3_agent.NS_PREFIX + r['id']
for r in router_list]
self.mock_ip.get_namespaces.return_value = (stale_namespace_list +
good_namespace_list +
other_namespaces)
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
self.assertTrue(agent._delete_stale_namespaces)
pm = self.external_process.return_value
pm.reset_mock()
agent._destroy_router_namespace = mock.MagicMock()
agent._cleanup_namespaces(router_list)
self.assertEqual(pm.disable.call_count, len(stale_namespace_list))
self.assertEqual(agent._destroy_router_namespace.call_count,
len(stale_namespace_list))
expected_args = [mock.call(ns) for ns in stale_namespace_list]
agent._destroy_router_namespace.assert_has_calls(expected_args,
any_order=True)
self.assertFalse(agent._delete_stale_namespaces)
def test_cleanup_namespace(self):
self.conf.set_override('router_id', None)
stale_namespaces = [l3_agent.NS_PREFIX + 'foo',
l3_agent.NS_PREFIX + 'bar']
other_namespaces = ['unknown']
self._cleanup_namespace_test(stale_namespaces,
[],
other_namespaces)
def test_cleanup_namespace_with_registered_router_ids(self):
self.conf.set_override('router_id', None)
stale_namespaces = [l3_agent.NS_PREFIX + 'cccc',
l3_agent.NS_PREFIX + 'eeeee']
router_list = [{'id': 'foo'}, {'id': 'aaaa'}]
other_namespaces = ['qdhcp-aabbcc', 'unknown']
self._cleanup_namespace_test(stale_namespaces,
router_list,
other_namespaces)
def test_cleanup_namespace_with_conf_router_id(self):
self.conf.set_override('router_id', 'bbbbb')
stale_namespaces = [l3_agent.NS_PREFIX + 'cccc',
l3_agent.NS_PREFIX + 'eeeee',
l3_agent.NS_PREFIX + self.conf.router_id]
router_list = [{'id': 'foo'}, {'id': 'aaaa'}]
other_namespaces = ['qdhcp-aabbcc', 'unknown']
self._cleanup_namespace_test(stale_namespaces,
router_list,
other_namespaces)
class TestL3AgentEventHandler(base.BaseTestCase): class TestL3AgentEventHandler(base.BaseTestCase):