diff --git a/etc/neutron/plugins/mlnx/mlnx_conf.ini b/etc/neutron/plugins/mlnx/mlnx_conf.ini index c3e5cc88ea..8419479043 100644 --- a/etc/neutron/plugins/mlnx/mlnx_conf.ini +++ b/etc/neutron/plugins/mlnx/mlnx_conf.ini @@ -34,12 +34,20 @@ # vnic_type = mlnx_direct # (StrOpt) Eswitch daemon end point connection url -# daemon_endpoint = 'tcp://127.0.0.1:5001' +# daemon_endpoint = 'tcp://127.0.0.1:60001' # The number of milliseconds the agent will wait for # response on request to daemon # request_timeout = 3000 +# The number of retries the agent will send request +# to daemon before giving up +# retries = 3 + +# The backoff rate multiplier for waiting period between retries +# on request to daemon, i.e. value of 2 will double +# the request timeout each retry +# backoff_rate = 2 [agent] # Agent's polling interval in seconds diff --git a/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py b/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py index c82dd9f83b..ae3ce98b67 100644 --- a/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py +++ b/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py @@ -392,6 +392,10 @@ class MlnxEswitchNeutronAgent(sg_rpc.SecurityGroupAgentRpcMixin): # If treat devices fails - must resync with plugin sync = self.process_network_ports(port_info) ports = port_info['current'] + except exceptions.RequestTimeout: + LOG.exception(_("Request timeout in agent event loop " + "eSwitchD is not responding - exiting...")) + raise SystemExit(1) except Exception: LOG.exception(_("Error in agent event loop")) sync = True diff --git a/neutron/plugins/mlnx/agent/utils.py b/neutron/plugins/mlnx/agent/utils.py index de05455c0d..dd4ccf0af5 100644 --- a/neutron/plugins/mlnx/agent/utils.py +++ b/neutron/plugins/mlnx/agent/utils.py @@ -19,6 +19,7 @@ import zmq from neutron.openstack.common import jsonutils from neutron.openstack.common import log as logging +from neutron.plugins.mlnx.common.comm_utils import RetryDecorator from neutron.plugins.mlnx.common import exceptions LOG = logging.getLogger(__name__) @@ -42,6 +43,7 @@ class EswitchUtils(object): self.poller.register(self._conn, zmq.POLLIN) return self.__conn + @RetryDecorator(exceptions.RequestTimeout) def send_msg(self, msg): self._conn.send(msg) @@ -55,7 +57,7 @@ class EswitchUtils(object): self._conn.close() self.poller.unregister(self._conn) self.__conn = None - raise exceptions.MlnxException(_("eSwitchD: Request timeout")) + raise exceptions.RequestTimeout() def parse_response_msg(self, recv_msg): msg = jsonutils.loads(recv_msg) @@ -69,7 +71,7 @@ class EswitchUtils(object): else: error_msg = _("Unknown operation status %s") % msg['status'] LOG.error(error_msg) - raise exceptions.MlnxException(error_msg) + raise exceptions.OperationFailed(err_msg=error_msg) def get_attached_vnics(self): LOG.debug(_("get_attached_vnics")) diff --git a/neutron/plugins/mlnx/common/comm_utils.py b/neutron/plugins/mlnx/common/comm_utils.py new file mode 100644 index 0000000000..a1a0f4a8a5 --- /dev/null +++ b/neutron/plugins/mlnx/common/comm_utils.py @@ -0,0 +1,66 @@ +# vim: tabstop=4 shiftwidth=4 softtabstop=4 +# +# Copyright 2013 Mellanox Technologies, Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time + +from oslo.config import cfg + +from neutron.openstack.common import log as logging +from neutron.plugins.mlnx.common import config # noqa + +LOG = logging.getLogger(__name__) + + +class RetryDecorator(object): + """Retry decorator reruns a method 'retries' times if an exception occurs. + + Decorator for retrying a method if exceptionToCheck exception occurs + If method raises exception, retries 'retries' times with increasing + back off period between calls with 'interval' multiplier + + :param exceptionToCheck: the exception to check + :param interval: initial delay between retries in seconds + :param retries: number of times to try before giving up + :raises: exceptionToCheck + """ + sleep_fn = time.sleep + + def __init__(self, exceptionToCheck, + interval=cfg.CONF.ESWITCH.request_timeout / 1000, + retries=cfg.CONF.ESWITCH.retries, + backoff_rate=cfg.CONF.ESWITCH.backoff_rate): + self.exc = exceptionToCheck + self.interval = interval + self.retries = retries + self.backoff_rate = backoff_rate + + def __call__(self, original_func): + def decorated(*args, **kwargs): + sleep_interval = self.interval + num_of_iter = self.retries + while num_of_iter > 0: + try: + return original_func(*args, **kwargs) + except self.exc: + LOG.debug(_("Request timeout - call again after " + "%s seconds"), sleep_interval) + RetryDecorator.sleep_fn(sleep_interval) + num_of_iter -= 1 + sleep_interval *= self.backoff_rate + + return original_func(*args, **kwargs) + return decorated diff --git a/neutron/plugins/mlnx/common/config.py b/neutron/plugins/mlnx/common/config.py index adf868ea7e..f5115845bf 100644 --- a/neutron/plugins/mlnx/common/config.py +++ b/neutron/plugins/mlnx/common/config.py @@ -48,6 +48,13 @@ eswitch_opts = [ cfg.IntOpt('request_timeout', default=3000, help=_("The number of milliseconds the agent will wait for " "response on request to daemon.")), + cfg.IntOpt('retries', default=3, + help=_("The number of retries the agent will send request " + "to daemon before giving up")), + cfg.IntOpt('backoff_rate', default=2, + help=_("backoff rate multiplier for waiting period between " + "retries for request to daemon, i.e. value of 2 will " + " double the request timeout each retry")), ] agent_opts = [ diff --git a/neutron/plugins/mlnx/common/exceptions.py b/neutron/plugins/mlnx/common/exceptions.py index 54355a0011..6fd1682152 100644 --- a/neutron/plugins/mlnx/common/exceptions.py +++ b/neutron/plugins/mlnx/common/exceptions.py @@ -20,3 +20,11 @@ from neutron.common import exceptions as qexc class MlnxException(qexc.NeutronException): message = _("Mlnx Exception: %(err_msg)s") + + +class RequestTimeout(qexc.NeutronException): + message = _("Request Timeout: no response from eSwitchD") + + +class OperationFailed(qexc.NeutronException): + message = _("Operation Failed: %(err_msg)s") diff --git a/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py b/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py new file mode 100644 index 0000000000..00659e02c1 --- /dev/null +++ b/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py @@ -0,0 +1,139 @@ +# Copyright (c) 2013 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import mock +from oslo.config import cfg + +from neutron.plugins.mlnx.common.comm_utils import RetryDecorator +from neutron.plugins.mlnx.common import config # noqa +from neutron.plugins.mlnx.common import exceptions +from neutron.tests import base + + +class WrongException(Exception): + pass + + +class TestRetryDecorator(base.BaseTestCase): + def setUp(self): + super(TestRetryDecorator, self).setUp() + self.sleep_fn_p = mock.patch.object(RetryDecorator, 'sleep_fn') + self.sleep_fn = self.sleep_fn_p.start() + self.addCleanup(self.sleep_fn_p.stop) + + def test_no_retry_required(self): + self.counter = 0 + + @RetryDecorator(exceptions.RequestTimeout, interval=2, + retries=3, backoff_rate=2) + def succeeds(): + self.counter += 1 + return 'success' + + ret = succeeds() + self.assertFalse(self.sleep_fn.called) + self.assertEqual(ret, 'success') + self.assertEqual(self.counter, 1) + + def test_retry_zero_times(self): + self.counter = 0 + interval = 2 + backoff_rate = 2 + retries = 0 + + @RetryDecorator(exceptions.RequestTimeout, interval, + retries, backoff_rate) + def always_fails(): + self.counter += 1 + raise exceptions.RequestTimeout() + + self.assertRaises(exceptions.RequestTimeout, always_fails) + self.assertEqual(self.counter, 1) + self.assertFalse(self.sleep_fn.called) + + def test_retries_once(self): + self.counter = 0 + interval = 2 + backoff_rate = 2 + retries = 3 + + @RetryDecorator(exceptions.RequestTimeout, interval, + retries, backoff_rate) + def fails_once(): + self.counter += 1 + if self.counter < 2: + raise exceptions.RequestTimeout() + else: + return 'success' + + ret = fails_once() + self.assertEqual(ret, 'success') + self.assertEqual(self.counter, 2) + self.assertEqual(self.sleep_fn.call_count, 1) + self.sleep_fn.assert_called_with(interval) + + def test_limit_is_reached(self): + self.counter = 0 + retries = 3 + interval = 2 + backoff_rate = 4 + + @RetryDecorator(exceptions.RequestTimeout, interval, + retries, backoff_rate) + def always_fails(): + self.counter += 1 + raise exceptions.RequestTimeout() + + self.assertRaises(exceptions.RequestTimeout, always_fails) + self.assertEqual(self.counter, retries + 1) + self.assertEqual(self.sleep_fn.call_count, retries) + + expected_sleep_fn_arg = [] + for i in range(retries): + expected_sleep_fn_arg.append(interval) + interval *= backoff_rate + + self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg)) + + def test_limit_is_reached_with_conf(self): + self.counter = 0 + + @RetryDecorator(exceptions.RequestTimeout) + def always_fails(): + self.counter += 1 + raise exceptions.RequestTimeout() + + retry = cfg.CONF.ESWITCH.retries + interval = cfg.CONF.ESWITCH.request_timeout / 1000 + delay_rate = cfg.CONF.ESWITCH.backoff_rate + + expected_sleep_fn_arg = [] + for i in range(retry): + expected_sleep_fn_arg.append(interval) + interval *= delay_rate + + self.assertRaises(exceptions.RequestTimeout, always_fails) + self.assertEqual(self.counter, retry + 1) + self.assertEqual(self.sleep_fn.call_count, retry) + self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg)) + + def test_wrong_exception_no_retry(self): + + @RetryDecorator(exceptions.RequestTimeout) + def raise_unexpected_error(): + raise WrongException("wrong exception") + + self.assertRaises(WrongException, raise_unexpected_error) + self.assertFalse(self.sleep_fn.called)