From 2d69f32fb5561fef626358460fe420f106bfa060 Mon Sep 17 00:00:00 2001 From: Salvatore Orlando Date: Wed, 2 Oct 2013 12:14:14 -0700 Subject: [PATCH] Prevent L3 agent looping calls from hanging This patch adopts several measures to prevent _sync_routers_task and _rpc_loop from hanging because of subprocess.Popen.communicate not returning. 1) Perform a sleep everytime a command is completed, similarly to what is done in openstack.common.processutils.execute 2) Disable by default GARP, as kernel crashes caused by arping have been observed 3) Prevent a non-critical keyerror in _router_removed from triggering again a full sync, which might put the system under significant load. This patch also adds debug log statements aimed at improving the ability of debugging similar failures. Change-Id: I003316bce0f38b7d2ea7d563b5a0a58676834398 Partial-Bug: 1224001 --- neutron/agent/l3_agent.py | 21 ++++++++++++---- neutron/agent/linux/utils.py | 38 +++++++++++++++++------------ neutron/tests/unit/test_l3_agent.py | 1 + 3 files changed, 39 insertions(+), 21 deletions(-) diff --git a/neutron/agent/l3_agent.py b/neutron/agent/l3_agent.py index d6c54fc69f..b66d441d2e 100644 --- a/neutron/agent/l3_agent.py +++ b/neutron/agent/l3_agent.py @@ -163,10 +163,9 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager): help=_("TCP Port used by Neutron metadata namespace " "proxy.")), cfg.IntOpt('send_arp_for_ha', - default=3, - help=_("Send this many gratuitous ARPs for HA setup, " - "set it below or equal to 0 to disable this " - "feature.")), + default=0, + help=_("Send this many gratuitous ARPs for HA setup, if " + "less than or equal to 0, the feature is disabled")), cfg.BoolOpt('use_namespaces', default=True, help=_("Allow overlapping IP.")), cfg.StrOpt('router_id', default='', @@ -308,7 +307,11 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager): self._spawn_metadata_proxy(ri) def _router_removed(self, router_id): - ri = self.router_info[router_id] + ri = self.router_info.get(router_id) + if ri is None: + LOG.warn(_("Info for router %s were not found. " + "Skipping router removal"), router_id) + return ri.router['gw_port'] = None ri.router[l3_constants.INTERFACE_KEY] = [] ri.router[l3_constants.FLOATINGIP_KEY] = [] @@ -706,6 +709,8 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager): # so we can clear the value of updated_routers # and removed_routers try: + LOG.debug(_("Starting RPC loop for %d updated routers"), + len(self.updated_routers)) if self.updated_routers: router_ids = list(self.updated_routers) self.updated_routers.clear() @@ -713,6 +718,7 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager): self.context, router_ids) self._process_routers(routers) self._process_router_delete() + LOG.debug(_("RPC loop successfully completed")) except Exception: LOG.exception(_("Failed synchronizing routers")) self.fullsync = True @@ -732,6 +738,8 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager): def _sync_routers_task(self, context): if self.services_sync: super(L3NATAgent, self).process_services_sync(context) + LOG.debug(_("Starting _sync_routers_task - fullsync:%s"), + self.fullsync) if not self.fullsync: return try: @@ -744,6 +752,7 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager): LOG.debug(_('Processing :%r'), routers) self._process_routers(routers, all_routers=True) self.fullsync = False + LOG.debug(_("_sync_routers_task successfully completed")) except Exception: LOG.exception(_("Failed synchronizing routers")) self.fullsync = True @@ -809,6 +818,7 @@ class L3NATAgentWithStateReport(L3NATAgent): self.heartbeat.start(interval=report_interval) def _report_state(self): + LOG.debug(_("Report state task started")) num_ex_gw_ports = 0 num_interfaces = 0 num_floating_ips = 0 @@ -832,6 +842,7 @@ class L3NATAgentWithStateReport(L3NATAgent): self.use_call) self.agent_state.pop('start_flag', None) self.use_call = False + LOG.debug(_("Report state task successfully completed")) except AttributeError: # This means the server does not support report_state LOG.warn(_("Neutron server does not support state report." diff --git a/neutron/agent/linux/utils.py b/neutron/agent/linux/utils.py index fc9f4c9769..6e0aae41ef 100644 --- a/neutron/agent/linux/utils.py +++ b/neutron/agent/linux/utils.py @@ -25,6 +25,7 @@ import struct import tempfile from eventlet.green import subprocess +from eventlet import greenthread from neutron.common import utils from neutron.openstack.common import log as logging @@ -43,22 +44,27 @@ def execute(cmd, root_helper=None, process_input=None, addl_env=None, env = os.environ.copy() if addl_env: env.update(addl_env) - obj = utils.subprocess_popen(cmd, shell=False, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env) - - _stdout, _stderr = (process_input and - obj.communicate(process_input) or - obj.communicate()) - obj.stdin.close() - m = _("\nCommand: %(cmd)s\nExit code: %(code)s\nStdout: %(stdout)r\n" - "Stderr: %(stderr)r") % {'cmd': cmd, 'code': obj.returncode, - 'stdout': _stdout, 'stderr': _stderr} - LOG.debug(m) - if obj.returncode and check_exit_code: - raise RuntimeError(m) + try: + obj = utils.subprocess_popen(cmd, shell=False, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env) + _stdout, _stderr = (process_input and + obj.communicate(process_input) or + obj.communicate()) + obj.stdin.close() + m = _("\nCommand: %(cmd)s\nExit code: %(code)s\nStdout: %(stdout)r\n" + "Stderr: %(stderr)r") % {'cmd': cmd, 'code': obj.returncode, + 'stdout': _stdout, 'stderr': _stderr} + LOG.debug(m) + if obj.returncode and check_exit_code: + raise RuntimeError(m) + finally: + # NOTE(termie): this appears to be necessary to let the subprocess + # call clean something up in between calls, without + # it two execute calls in a row hangs the second one + greenthread.sleep(0) return return_stderr and (_stdout, _stderr) or _stdout diff --git a/neutron/tests/unit/test_l3_agent.py b/neutron/tests/unit/test_l3_agent.py index 3497a5ef89..86e43d6005 100644 --- a/neutron/tests/unit/test_l3_agent.py +++ b/neutron/tests/unit/test_l3_agent.py @@ -46,6 +46,7 @@ class TestBasicRouterOperations(base.BaseTestCase): self.conf.set_override('router_id', 'fake_id') self.conf.set_override('interface_driver', 'neutron.agent.linux.interface.NullDriver') + self.conf.set_override('send_arp_for_ha', 1) self.conf.root_helper = 'sudo' self.device_exists_p = mock.patch(