L3 Agent restart causes network outage
When a L3 agent controlling multiple qrouter namespaces restarts, it destroys all qrouter namespaces even if some of them are still in use. As a result, network traffic could be stopped on the VMs that use the networks associated with these namespaces. So what is needed is for the L3 agent to preserve those qrouter namespaces a L3 agent instance recognizes and to destroy those it does not know about. Closes-Bug: #1175695 Change-Id: Idae77886bd195d773878c3d212ccfd56269216fb
This commit is contained in:
parent
a2220fad2c
commit
5dd1798903
@ -216,8 +216,9 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
|
|||||||
self.updated_routers = set()
|
self.updated_routers = set()
|
||||||
self.removed_routers = set()
|
self.removed_routers = set()
|
||||||
self.sync_progress = False
|
self.sync_progress = False
|
||||||
if self.conf.use_namespaces:
|
|
||||||
self._destroy_router_namespaces(self.conf.router_id)
|
self._delete_stale_namespaces = (self.conf.use_namespaces and
|
||||||
|
self.conf.router_delete_namespaces)
|
||||||
|
|
||||||
self.rpc_loop = loopingcall.FixedIntervalLoopingCall(
|
self.rpc_loop = loopingcall.FixedIntervalLoopingCall(
|
||||||
self._rpc_loop)
|
self._rpc_loop)
|
||||||
@ -240,28 +241,46 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
|
|||||||
LOG.error(msg)
|
LOG.error(msg)
|
||||||
raise SystemExit(msg)
|
raise SystemExit(msg)
|
||||||
|
|
||||||
def _destroy_router_namespaces(self, only_router_id=None):
|
def _cleanup_namespaces(self, routers):
|
||||||
"""Destroy router namespaces on the host to eliminate all stale
|
"""Destroy stale router namespaces on host when L3 agent restarts
|
||||||
linux devices, iptables rules, and namespaces.
|
|
||||||
|
|
||||||
If only_router_id is passed, only destroy single namespace, to allow
|
This routine is called when self._delete_stale_namespaces is True.
|
||||||
for multiple l3 agents on the same host, without stepping on each
|
|
||||||
other's toes on init. This only makes sense if only_router_id is set.
|
The argument routers is the list of routers that are recorded in
|
||||||
|
the database as being hosted on this node.
|
||||||
"""
|
"""
|
||||||
root_ip = ip_lib.IPWrapper(self.root_helper)
|
try:
|
||||||
for ns in root_ip.get_namespaces(self.root_helper):
|
root_ip = ip_lib.IPWrapper(self.root_helper)
|
||||||
if ns.startswith(NS_PREFIX):
|
|
||||||
router_id = ns[len(NS_PREFIX):]
|
|
||||||
if only_router_id and not only_router_id == router_id:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if self.conf.enable_metadata_proxy:
|
host_namespaces = root_ip.get_namespaces(self.root_helper)
|
||||||
self._destroy_metadata_proxy(router_id, ns)
|
router_namespaces = set(ns for ns in host_namespaces
|
||||||
|
if ns.startswith(NS_PREFIX))
|
||||||
|
ns_to_ignore = set(NS_PREFIX + r['id'] for r in routers)
|
||||||
|
ns_to_destroy = router_namespaces - ns_to_ignore
|
||||||
|
except RuntimeError:
|
||||||
|
LOG.exception(_('RuntimeError in obtaining router list '
|
||||||
|
'for namespace cleanup.'))
|
||||||
|
else:
|
||||||
|
self._destroy_stale_router_namespaces(ns_to_destroy)
|
||||||
|
|
||||||
try:
|
def _destroy_stale_router_namespaces(self, router_namespaces):
|
||||||
self._destroy_router_namespace(ns)
|
"""Destroys the stale router namespaces
|
||||||
except Exception:
|
|
||||||
LOG.exception(_("Failed deleting namespace '%s'"), ns)
|
The argumenet router_namespaces is a list of stale router namespaces
|
||||||
|
|
||||||
|
As some stale router namespaces may not be able to be deleted, only
|
||||||
|
one attempt will be made to delete them.
|
||||||
|
"""
|
||||||
|
for ns in router_namespaces:
|
||||||
|
if self.conf.enable_metadata_proxy:
|
||||||
|
self._destroy_metadata_proxy(ns[len(NS_PREFIX):], ns)
|
||||||
|
|
||||||
|
try:
|
||||||
|
self._destroy_router_namespace(ns)
|
||||||
|
except RuntimeError:
|
||||||
|
LOG.exception(_('Failed to destroy stale router namespace '
|
||||||
|
'%s'), ns)
|
||||||
|
self._delete_stale_namespaces = False
|
||||||
|
|
||||||
def _destroy_router_namespace(self, namespace):
|
def _destroy_router_namespace(self, namespace):
|
||||||
ns_ip = ip_lib.IPWrapper(self.root_helper, namespace=namespace)
|
ns_ip = ip_lib.IPWrapper(self.root_helper, namespace=namespace)
|
||||||
@ -759,10 +778,19 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
|
|||||||
self._process_routers(routers, all_routers=True)
|
self._process_routers(routers, all_routers=True)
|
||||||
self.fullsync = False
|
self.fullsync = False
|
||||||
LOG.debug(_("_sync_routers_task successfully completed"))
|
LOG.debug(_("_sync_routers_task successfully completed"))
|
||||||
|
except rpc_common.RPCException:
|
||||||
|
LOG.exception(_("Failed synchronizing routers due to RPC error"))
|
||||||
|
self.fullsync = True
|
||||||
|
return
|
||||||
except Exception:
|
except Exception:
|
||||||
LOG.exception(_("Failed synchronizing routers"))
|
LOG.exception(_("Failed synchronizing routers"))
|
||||||
self.fullsync = True
|
self.fullsync = True
|
||||||
|
|
||||||
|
# Resync is not necessary for the cleanup of stale
|
||||||
|
# namespaces.
|
||||||
|
if self._delete_stale_namespaces:
|
||||||
|
self._cleanup_namespaces(routers)
|
||||||
|
|
||||||
def after_start(self):
|
def after_start(self):
|
||||||
LOG.info(_("L3 agent started"))
|
LOG.info(_("L3 agent started"))
|
||||||
|
|
||||||
|
@ -732,55 +732,6 @@ class TestBasicRouterOperations(base.BaseTestCase):
|
|||||||
agent._process_router_delete()
|
agent._process_router_delete()
|
||||||
self.assertFalse(list(agent.removed_routers))
|
self.assertFalse(list(agent.removed_routers))
|
||||||
|
|
||||||
def test_destroy_namespace(self):
|
|
||||||
|
|
||||||
class FakeDev(object):
|
|
||||||
def __init__(self, name):
|
|
||||||
self.name = name
|
|
||||||
|
|
||||||
self.mock_ip.get_namespaces.return_value = ['qrouter-foo',
|
|
||||||
'qrouter-bar']
|
|
||||||
self.mock_ip.get_devices.return_value = [FakeDev('qr-aaaa'),
|
|
||||||
FakeDev('qgw-aaaa')]
|
|
||||||
|
|
||||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
|
||||||
|
|
||||||
pm = self.external_process.return_value
|
|
||||||
pm.reset_mock()
|
|
||||||
|
|
||||||
agent._destroy_router_namespace = mock.MagicMock()
|
|
||||||
agent._destroy_router_namespaces()
|
|
||||||
|
|
||||||
self.assertEqual(pm.disable.call_count, 2)
|
|
||||||
|
|
||||||
self.assertEqual(agent._destroy_router_namespace.call_count, 2)
|
|
||||||
|
|
||||||
def test_destroy_namespace_with_router_id(self):
|
|
||||||
|
|
||||||
class FakeDev(object):
|
|
||||||
def __init__(self, name):
|
|
||||||
self.name = name
|
|
||||||
|
|
||||||
self.conf.router_id = _uuid()
|
|
||||||
|
|
||||||
namespaces = ['qrouter-foo', 'qrouter-' + self.conf.router_id]
|
|
||||||
|
|
||||||
self.mock_ip.get_namespaces.return_value = namespaces
|
|
||||||
self.mock_ip.get_devices.return_value = [FakeDev('qr-aaaa'),
|
|
||||||
FakeDev('qgw-aaaa')]
|
|
||||||
|
|
||||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
|
||||||
|
|
||||||
pm = self.external_process.return_value
|
|
||||||
pm.reset_mock()
|
|
||||||
|
|
||||||
agent._destroy_router_namespace = mock.MagicMock()
|
|
||||||
agent._destroy_router_namespaces(self.conf.router_id)
|
|
||||||
|
|
||||||
self.assertEqual(pm.disable.call_count, 1)
|
|
||||||
|
|
||||||
self.assertEqual(agent._destroy_router_namespace.call_count, 1)
|
|
||||||
|
|
||||||
def test_destroy_router_namespace_skips_ns_removal(self):
|
def test_destroy_router_namespace_skips_ns_removal(self):
|
||||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||||
agent._destroy_router_namespace("fakens")
|
agent._destroy_router_namespace("fakens")
|
||||||
@ -842,6 +793,7 @@ class TestBasicRouterOperations(base.BaseTestCase):
|
|||||||
self.conf.set_override('router_id', '1234')
|
self.conf.set_override('router_id', '1234')
|
||||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||||
self.assertEqual(['1234'], agent._router_ids())
|
self.assertEqual(['1234'], agent._router_ids())
|
||||||
|
self.assertFalse(agent._delete_stale_namespaces)
|
||||||
|
|
||||||
def test_process_routers_with_no_ext_net_in_conf(self):
|
def test_process_routers_with_no_ext_net_in_conf(self):
|
||||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||||
@ -940,6 +892,69 @@ class TestBasicRouterOperations(base.BaseTestCase):
|
|||||||
'-p tcp -m tcp --dport 8775 -j ACCEPT')
|
'-p tcp -m tcp --dport 8775 -j ACCEPT')
|
||||||
self.assertEqual([rules], agent.metadata_filter_rules())
|
self.assertEqual([rules], agent.metadata_filter_rules())
|
||||||
|
|
||||||
|
def _cleanup_namespace_test(self,
|
||||||
|
stale_namespace_list,
|
||||||
|
router_list,
|
||||||
|
other_namespaces):
|
||||||
|
self.conf.set_override('router_delete_namespaces', True)
|
||||||
|
|
||||||
|
good_namespace_list = [l3_agent.NS_PREFIX + r['id']
|
||||||
|
for r in router_list]
|
||||||
|
self.mock_ip.get_namespaces.return_value = (stale_namespace_list +
|
||||||
|
good_namespace_list +
|
||||||
|
other_namespaces)
|
||||||
|
|
||||||
|
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||||
|
|
||||||
|
self.assertTrue(agent._delete_stale_namespaces)
|
||||||
|
|
||||||
|
pm = self.external_process.return_value
|
||||||
|
pm.reset_mock()
|
||||||
|
|
||||||
|
agent._destroy_router_namespace = mock.MagicMock()
|
||||||
|
agent._cleanup_namespaces(router_list)
|
||||||
|
|
||||||
|
self.assertEqual(pm.disable.call_count, len(stale_namespace_list))
|
||||||
|
self.assertEqual(agent._destroy_router_namespace.call_count,
|
||||||
|
len(stale_namespace_list))
|
||||||
|
expected_args = [mock.call(ns) for ns in stale_namespace_list]
|
||||||
|
agent._destroy_router_namespace.assert_has_calls(expected_args,
|
||||||
|
any_order=True)
|
||||||
|
self.assertFalse(agent._delete_stale_namespaces)
|
||||||
|
|
||||||
|
def test_cleanup_namespace(self):
|
||||||
|
self.conf.set_override('router_id', None)
|
||||||
|
stale_namespaces = [l3_agent.NS_PREFIX + 'foo',
|
||||||
|
l3_agent.NS_PREFIX + 'bar']
|
||||||
|
other_namespaces = ['unknown']
|
||||||
|
|
||||||
|
self._cleanup_namespace_test(stale_namespaces,
|
||||||
|
[],
|
||||||
|
other_namespaces)
|
||||||
|
|
||||||
|
def test_cleanup_namespace_with_registered_router_ids(self):
|
||||||
|
self.conf.set_override('router_id', None)
|
||||||
|
stale_namespaces = [l3_agent.NS_PREFIX + 'cccc',
|
||||||
|
l3_agent.NS_PREFIX + 'eeeee']
|
||||||
|
router_list = [{'id': 'foo'}, {'id': 'aaaa'}]
|
||||||
|
other_namespaces = ['qdhcp-aabbcc', 'unknown']
|
||||||
|
|
||||||
|
self._cleanup_namespace_test(stale_namespaces,
|
||||||
|
router_list,
|
||||||
|
other_namespaces)
|
||||||
|
|
||||||
|
def test_cleanup_namespace_with_conf_router_id(self):
|
||||||
|
self.conf.set_override('router_id', 'bbbbb')
|
||||||
|
stale_namespaces = [l3_agent.NS_PREFIX + 'cccc',
|
||||||
|
l3_agent.NS_PREFIX + 'eeeee',
|
||||||
|
l3_agent.NS_PREFIX + self.conf.router_id]
|
||||||
|
router_list = [{'id': 'foo'}, {'id': 'aaaa'}]
|
||||||
|
other_namespaces = ['qdhcp-aabbcc', 'unknown']
|
||||||
|
|
||||||
|
self._cleanup_namespace_test(stale_namespaces,
|
||||||
|
router_list,
|
||||||
|
other_namespaces)
|
||||||
|
|
||||||
|
|
||||||
class TestL3AgentEventHandler(base.BaseTestCase):
|
class TestL3AgentEventHandler(base.BaseTestCase):
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user