Merge "Add request timeout handling for Mellanox Neutron Agent"

2013-12-07 10:05:57 +00:00 · 2013-12-07 10:05:57 +00:00 · 54a9f31872
commit 54a9f31872
parent d4972f8e99 a191a1cfa3
7 changed files with 237 additions and 3 deletions
--- a/etc/neutron/plugins/mlnx/mlnx_conf.ini
+++ b/etc/neutron/plugins/mlnx/mlnx_conf.ini
@ -34,12 +34,20 @@
 # vnic_type = mlnx_direct

 # (StrOpt) Eswitch daemon end point connection url
-# daemon_endpoint = 'tcp://127.0.0.1:5001'
+# daemon_endpoint = 'tcp://127.0.0.1:60001'

 # The number of milliseconds the agent will wait for
 # response on request to daemon
 # request_timeout = 3000

+# The number of times the agent will retry a request
+# to the daemon before giving up
+# retries = 3
+
+# The backoff rate multiplier for the waiting period between retries
+# of a request to the daemon, i.e. a value of 2 will double
+# the waiting period after each retry
+# backoff_rate = 2

 [agent]
 # Agent's polling interval in seconds
--- a/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py
+++ b/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py
@ -392,6 +392,10 @@ class MlnxEswitchNeutronAgent(sg_rpc.SecurityGroupAgentRpcMixin):
                    # If treat devices fails - must resync with plugin
                    sync = self.process_network_ports(port_info)
                    ports = port_info['current']
+            except exceptions.RequestTimeout:
+                LOG.exception(_("Request timeout in agent event loop "
+                                "eSwitchD is not responding - exiting..."))
+                raise SystemExit(1)
            except Exception:
                LOG.exception(_("Error in agent event loop"))
                sync = True
--- a/neutron/plugins/mlnx/agent/utils.py
+++ b/neutron/plugins/mlnx/agent/utils.py
@ -19,6 +19,7 @@ import zmq

 from neutron.openstack.common import jsonutils
 from neutron.openstack.common import log as logging
+from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
 from neutron.plugins.mlnx.common import exceptions

 LOG = logging.getLogger(__name__)
@ -42,6 +43,7 @@ class EswitchUtils(object):
            self.poller.register(self._conn, zmq.POLLIN)
        return self.__conn

+    @RetryDecorator(exceptions.RequestTimeout)
    def send_msg(self, msg):
        self._conn.send(msg)

@ -55,7 +57,7 @@ class EswitchUtils(object):
            self._conn.close()
            self.poller.unregister(self._conn)
            self.__conn = None
-            raise exceptions.MlnxException(_("eSwitchD: Request timeout"))
+            raise exceptions.RequestTimeout()

    def parse_response_msg(self, recv_msg):
        msg = jsonutils.loads(recv_msg)
@ -69,7 +71,7 @@ class EswitchUtils(object):
        else:
            error_msg = _("Unknown operation status %s") % msg['status']
        LOG.error(error_msg)
-        raise exceptions.MlnxException(error_msg)
+        raise exceptions.OperationFailed(err_msg=error_msg)

    def get_attached_vnics(self):
        LOG.debug(_("get_attached_vnics"))
--- a/neutron/plugins/mlnx/common/comm_utils.py
+++ b/neutron/plugins/mlnx/common/comm_utils.py
@ -0,0 +1,66 @@
+# vim: tabstop=4 shiftwidth=4 softtabstop=4
+#
+# Copyright 2013 Mellanox Technologies, Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import time
+
+from oslo.config import cfg
+
+from neutron.openstack.common import log as logging
+from neutron.plugins.mlnx.common import config  # noqa
+
+LOG = logging.getLogger(__name__)
+
+
+class RetryDecorator(object):
+    """Decorator that re-invokes a callable while it raises a given exception.
+
+    The wrapped callable is invoked once and, whenever it raises
+    exceptionToCheck, invoked again up to 'retries' more times. Before each
+    retry the decorator sleeps; the sleep starts at 'interval' seconds and
+    is multiplied by 'backoff_rate' after every failed attempt. Exceptions
+    of any other type propagate immediately without a retry.
+
+    Arguments left as None are read from the [ESWITCH] configuration group
+    each time the decorated callable is invoked rather than when the
+    decorator is created. Decorators are applied at import time, which may
+    be before the configuration files have been parsed, so reading the
+    options eagerly could freeze the built-in defaults.
+
+    :param exceptionToCheck: the exception class that triggers a retry
+    :param interval: initial delay between retries in seconds; None means
+                     ESWITCH.request_timeout converted from milliseconds
+    :param retries: number of retries after the first attempt; None means
+                    ESWITCH.retries
+    :param backoff_rate: multiplier applied to the delay after each retry;
+                         None means ESWITCH.backoff_rate
+    :raises: exceptionToCheck if the last attempt still fails
+    """
+    # Class-level hook so that tests can replace the real sleep.
+    sleep_fn = time.sleep
+
+    def __init__(self, exceptionToCheck, interval=None, retries=None,
+                 backoff_rate=None):
+        self.exc = exceptionToCheck
+        self.interval = interval
+        self.retries = retries
+        self.backoff_rate = backoff_rate
+
+    def _resolve(self):
+        """Return (interval, retries, backoff_rate) with config fallbacks."""
+        conf = cfg.CONF.ESWITCH
+        interval = self.interval
+        if interval is None:
+            # request_timeout is in milliseconds; divide by a float so that
+            # sub-second timeouts are not truncated to 0 on Python 2.
+            interval = conf.request_timeout / 1000.0
+        retries = conf.retries if self.retries is None else self.retries
+        backoff_rate = (conf.backoff_rate if self.backoff_rate is None
+                        else self.backoff_rate)
+        return interval, retries, backoff_rate
+
+    def __call__(self, original_func):
+        @functools.wraps(original_func)
+        def decorated(*args, **kwargs):
+            sleep_interval, retries, backoff_rate = self._resolve()
+            for attempt in range(retries):
+                try:
+                    return original_func(*args, **kwargs)
+                except self.exc:
+                    LOG.debug(_("Request timeout - call again after "
+                              "%s seconds"), sleep_interval)
+                    RetryDecorator.sleep_fn(sleep_interval)
+                    sleep_interval *= backoff_rate
+
+            # Final attempt: any exception now propagates to the caller.
+            return original_func(*args, **kwargs)
+        return decorated
--- a/neutron/plugins/mlnx/common/config.py
+++ b/neutron/plugins/mlnx/common/config.py
@ -48,6 +48,13 @@ eswitch_opts = [
    cfg.IntOpt('request_timeout', default=3000,
               help=_("The number of milliseconds the agent will wait for "
                      "response on request to daemon.")),
+    cfg.IntOpt('retries', default=3,
+               help=_("The number of retries the agent will send request "
+                      "to daemon before giving up")),
+    cfg.IntOpt('backoff_rate', default=2,
+               help=_("backoff rate multiplier for waiting period between "
+                      "retries for request to daemon, i.e. value of 2 will "
+                      " double the request timeout each retry")),
 ]

 agent_opts = [
--- a/neutron/plugins/mlnx/common/exceptions.py
+++ b/neutron/plugins/mlnx/common/exceptions.py
@ -20,3 +20,11 @@ from neutron.common import exceptions as qexc

 class MlnxException(qexc.NeutronException):
    message = _("Mlnx Exception: %(err_msg)s")
+
+
+class RequestTimeout(qexc.NeutronException):
+    """Raised when no response to a request is received from eSwitchD."""
+    message = _("Request Timeout: no response from eSwitchD")
+
+
+class OperationFailed(qexc.NeutronException):
+    """Raised with an 'err_msg' describing why an operation failed."""
+    message = _("Operation Failed: %(err_msg)s")
--- a/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
+++ b/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
@ -0,0 +1,139 @@
+# Copyright (c) 2013 OpenStack Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import mock
+from oslo.config import cfg
+
+from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
+from neutron.plugins.mlnx.common import config  # noqa
+from neutron.plugins.mlnx.common import exceptions
+from neutron.tests import base
+
+
+class WrongException(Exception):
+    """Exception type the decorator under test is not configured to retry."""
+
+
+class TestRetryDecorator(base.BaseTestCase):
+    """Unit tests for RetryDecorator with the sleep function mocked out."""
+
+    def setUp(self):
+        super(TestRetryDecorator, self).setUp()
+        # Patch the class-level sleep hook so that no test really sleeps
+        # and the requested delays can be asserted on.
+        self.sleep_fn_p = mock.patch.object(RetryDecorator, 'sleep_fn')
+        self.sleep_fn = self.sleep_fn_p.start()
+        self.addCleanup(self.sleep_fn_p.stop)
+
+    @staticmethod
+    def _expected_sleep_calls(interval, retries, backoff_rate):
+        """Return the sleep calls expected when every retry is used up.
+
+        A real list is returned (not a map() iterator) because
+        assert_has_calls needs a list to match against on Python 3.
+        """
+        calls = []
+        for attempt in range(retries):
+            calls.append(mock.call(interval))
+            interval *= backoff_rate
+        return calls
+
+    def test_no_retry_required(self):
+        """A call that succeeds at once is made once and never sleeps."""
+        self.counter = 0
+
+        @RetryDecorator(exceptions.RequestTimeout, interval=2,
+                        retries=3, backoff_rate=2)
+        def succeeds():
+            self.counter += 1
+            return 'success'
+
+        ret = succeeds()
+        self.assertFalse(self.sleep_fn.called)
+        self.assertEqual(ret, 'success')
+        self.assertEqual(self.counter, 1)
+
+    def test_retry_zero_times(self):
+        """With retries=0 the first failure propagates without sleeping."""
+        self.counter = 0
+        interval = 2
+        backoff_rate = 2
+        retries = 0
+
+        @RetryDecorator(exceptions.RequestTimeout, interval,
+                        retries, backoff_rate)
+        def always_fails():
+            self.counter += 1
+            raise exceptions.RequestTimeout()
+
+        self.assertRaises(exceptions.RequestTimeout, always_fails)
+        self.assertEqual(self.counter, 1)
+        self.assertFalse(self.sleep_fn.called)
+
+    def test_retries_once(self):
+        """One failure followed by success sleeps exactly once."""
+        self.counter = 0
+        interval = 2
+        backoff_rate = 2
+        retries = 3
+
+        @RetryDecorator(exceptions.RequestTimeout, interval,
+                        retries, backoff_rate)
+        def fails_once():
+            self.counter += 1
+            if self.counter < 2:
+                raise exceptions.RequestTimeout()
+            else:
+                return 'success'
+
+        ret = fails_once()
+        self.assertEqual(ret, 'success')
+        self.assertEqual(self.counter, 2)
+        self.assertEqual(self.sleep_fn.call_count, 1)
+        self.sleep_fn.assert_called_with(interval)
+
+    def test_limit_is_reached(self):
+        """A call that always fails is tried retries + 1 times, then raises."""
+        self.counter = 0
+        retries = 3
+        interval = 2
+        backoff_rate = 4
+
+        @RetryDecorator(exceptions.RequestTimeout, interval,
+                        retries, backoff_rate)
+        def always_fails():
+            self.counter += 1
+            raise exceptions.RequestTimeout()
+
+        self.assertRaises(exceptions.RequestTimeout, always_fails)
+        self.assertEqual(self.counter, retries + 1)
+        self.assertEqual(self.sleep_fn.call_count, retries)
+        self.sleep_fn.assert_has_calls(
+            self._expected_sleep_calls(interval, retries, backoff_rate))
+
+    def test_limit_is_reached_with_conf(self):
+        """Omitted arguments fall back to the ESWITCH config options."""
+        self.counter = 0
+
+        @RetryDecorator(exceptions.RequestTimeout)
+        def always_fails():
+            self.counter += 1
+            raise exceptions.RequestTimeout()
+
+        retry = cfg.CONF.ESWITCH.retries
+        # request_timeout is configured in milliseconds.
+        interval = cfg.CONF.ESWITCH.request_timeout / 1000
+        delay_rate = cfg.CONF.ESWITCH.backoff_rate
+
+        self.assertRaises(exceptions.RequestTimeout, always_fails)
+        self.assertEqual(self.counter, retry + 1)
+        self.assertEqual(self.sleep_fn.call_count, retry)
+        self.sleep_fn.assert_has_calls(
+            self._expected_sleep_calls(interval, retry, delay_rate))
+
+    def test_wrong_exception_no_retry(self):
+        """An exception of another type propagates without any retry."""
+
+        @RetryDecorator(exceptions.RequestTimeout)
+        def raise_unexpected_error():
+            raise WrongException("wrong exception")
+
+        self.assertRaises(WrongException, raise_unexpected_error)
+        self.assertFalse(self.sleep_fn.called)