Merge "Add request timeout handling for Mellanox Neutron Agent"

2013-12-07 10:05:57 +00:00 · 2013-12-07 10:05:57 +00:00 · 54a9f31872
commit 54a9f31872
parent d4972f8e99 a191a1cfa3
7 changed files with 237 additions and 3 deletions
--- a/etc/neutron/plugins/mlnx/mlnx_conf.ini
+++ b/etc/neutron/plugins/mlnx/mlnx_conf.ini
@ -34,12 +34,20 @@
 # vnic_type = mlnx_direct
 # (StrOpt) Eswitch daemon end point connection url
-# daemon_endpoint = 'tcp://127.0.0.1:5001'
+# daemon_endpoint = 'tcp://127.0.0.1:60001'
 # The number of milliseconds the agent will wait for
 # response on request to daemon
 # request_timeout = 3000
 # The number of retries the agent will send request
 # to daemon before giving up
 # retries = 3
 # The backoff rate multiplier for waiting period between retries
 # on request to daemon, i.e. value of 2 will double
 # the request timeout each retry
 # backoff_rate = 2
 [agent]
 # Agent's polling interval in seconds
--- a/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py
+++ b/neutron/plugins/mlnx/agent/eswitch_neutron_agent.py
@ -392,6 +392,10 @@ class MlnxEswitchNeutronAgent(sg_rpc.SecurityGroupAgentRpcMixin):
                    # If treat devices fails - must resync with plugin
                    sync = self.process_network_ports(port_info)
                    ports = port_info['current']
            except exceptions.RequestTimeout:
                LOG.exception(_("Request timeout in agent event loop "
                                "eSwitchD is not responding - exiting..."))
                raise SystemExit(1)
            except Exception:
                LOG.exception(_("Error in agent event loop"))
                sync = True
--- a/neutron/plugins/mlnx/agent/utils.py
+++ b/neutron/plugins/mlnx/agent/utils.py
@ -19,6 +19,7 @@ import zmq
 from neutron.openstack.common import jsonutils
 from neutron.openstack.common import log as logging
 from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
 from neutron.plugins.mlnx.common import exceptions
 LOG = logging.getLogger(__name__)
@ -42,6 +43,7 @@ class EswitchUtils(object):
            self.poller.register(self._conn, zmq.POLLIN)
        return self.__conn
    @RetryDecorator(exceptions.RequestTimeout)
    def send_msg(self, msg):
        self._conn.send(msg)
@ -55,7 +57,7 @@ class EswitchUtils(object):
            self._conn.close()
            self.poller.unregister(self._conn)
            self.__conn = None
-            raise exceptions.MlnxException(_("eSwitchD: Request timeout"))
+            raise exceptions.RequestTimeout()
    def parse_response_msg(self, recv_msg):
        msg = jsonutils.loads(recv_msg)
@ -69,7 +71,7 @@ class EswitchUtils(object):
        else:
            error_msg = _("Unknown operation status %s") % msg['status']
        LOG.error(error_msg)
-        raise exceptions.MlnxException(error_msg)
+        raise exceptions.OperationFailed(err_msg=error_msg)
    def get_attached_vnics(self):
        LOG.debug(_("get_attached_vnics"))
--- a/neutron/plugins/mlnx/common/comm_utils.py
+++ b/neutron/plugins/mlnx/common/comm_utils.py
@ -0,0 +1,66 @@
 # vim: tabstop=4 shiftwidth=4 softtabstop=4
 #
 # Copyright 2013 Mellanox Technologies, Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 # implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import time
 from oslo.config import cfg
 from neutron.openstack.common import log as logging
 from neutron.plugins.mlnx.common import config  # noqa
 LOG = logging.getLogger(__name__)
 class RetryDecorator(object):
    """Retry decorator reruns a method 'retries' times if an exception occurs.
    Decorator for retrying a method if exceptionToCheck exception occurs
    If method raises exception, retries 'retries' times with increasing
    back off period between calls with 'interval' multiplier
    :param exceptionToCheck: the exception to check
    :param interval: initial delay between retries in seconds
    :param retries: number of times to try before giving up
    :raises: exceptionToCheck
    """
    sleep_fn = time.sleep
    def __init__(self, exceptionToCheck,
                 interval=cfg.CONF.ESWITCH.request_timeout / 1000,
                 retries=cfg.CONF.ESWITCH.retries,
                 backoff_rate=cfg.CONF.ESWITCH.backoff_rate):
        self.exc = exceptionToCheck
        self.interval = interval
        self.retries = retries
        self.backoff_rate = backoff_rate
    def __call__(self, original_func):
        def decorated(*args, **kwargs):
            sleep_interval = self.interval
            num_of_iter = self.retries
            while num_of_iter > 0:
                try:
                    return original_func(*args, **kwargs)
                except self.exc:
                    LOG.debug(_("Request timeout - call again after "
                              "%s seconds"), sleep_interval)
                    RetryDecorator.sleep_fn(sleep_interval)
                    num_of_iter -= 1
                    sleep_interval *= self.backoff_rate
            return original_func(*args, **kwargs)
        return decorated
--- a/neutron/plugins/mlnx/common/config.py
+++ b/neutron/plugins/mlnx/common/config.py
@ -48,6 +48,13 @@ eswitch_opts = [
    cfg.IntOpt('request_timeout', default=3000,
               help=_("The number of milliseconds the agent will wait for "
                      "response on request to daemon.")),
    cfg.IntOpt('retries', default=3,
               help=_("The number of retries the agent will send request "
                      "to daemon before giving up")),
    cfg.IntOpt('backoff_rate', default=2,
               help=_("backoff rate multiplier for waiting period between "
                      "retries for request to daemon, i.e. value of 2 will "
                      " double the request timeout each retry")),
 ]
 agent_opts = [
--- a/neutron/plugins/mlnx/common/exceptions.py
+++ b/neutron/plugins/mlnx/common/exceptions.py
@ -20,3 +20,11 @@ from neutron.common import exceptions as qexc
 class MlnxException(qexc.NeutronException):
    message = _("Mlnx Exception: %(err_msg)s")
 class RequestTimeout(qexc.NeutronException):
    message = _("Request Timeout: no response from eSwitchD")
 class OperationFailed(qexc.NeutronException):
    message = _("Operation Failed: %(err_msg)s")
--- a/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
+++ b/neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
@ -0,0 +1,139 @@
 # Copyright (c) 2013 OpenStack Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 # implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import mock
 from oslo.config import cfg
 from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
 from neutron.plugins.mlnx.common import config  # noqa
 from neutron.plugins.mlnx.common import exceptions
 from neutron.tests import base
 class WrongException(Exception):
        pass
 class TestRetryDecorator(base.BaseTestCase):
    def setUp(self):
        super(TestRetryDecorator, self).setUp()
        self.sleep_fn_p = mock.patch.object(RetryDecorator, 'sleep_fn')
        self.sleep_fn = self.sleep_fn_p.start()
        self.addCleanup(self.sleep_fn_p.stop)
    def test_no_retry_required(self):
        self.counter = 0
        @RetryDecorator(exceptions.RequestTimeout, interval=2,
                        retries=3, backoff_rate=2)
        def succeeds():
            self.counter += 1
            return 'success'
        ret = succeeds()
        self.assertFalse(self.sleep_fn.called)
        self.assertEqual(ret, 'success')
        self.assertEqual(self.counter, 1)
    def test_retry_zero_times(self):
        self.counter = 0
        interval = 2
        backoff_rate = 2
        retries = 0
        @RetryDecorator(exceptions.RequestTimeout, interval,
                        retries, backoff_rate)
        def always_fails():
            self.counter += 1
            raise exceptions.RequestTimeout()
        self.assertRaises(exceptions.RequestTimeout, always_fails)
        self.assertEqual(self.counter, 1)
        self.assertFalse(self.sleep_fn.called)
    def test_retries_once(self):
        self.counter = 0
        interval = 2
        backoff_rate = 2
        retries = 3
        @RetryDecorator(exceptions.RequestTimeout, interval,
                        retries, backoff_rate)
        def fails_once():
            self.counter += 1
            if self.counter < 2:
                raise exceptions.RequestTimeout()
            else:
                return 'success'
        ret = fails_once()
        self.assertEqual(ret, 'success')
        self.assertEqual(self.counter, 2)
        self.assertEqual(self.sleep_fn.call_count, 1)
        self.sleep_fn.assert_called_with(interval)
    def test_limit_is_reached(self):
        self.counter = 0
        retries = 3
        interval = 2
        backoff_rate = 4
        @RetryDecorator(exceptions.RequestTimeout, interval,
                        retries, backoff_rate)
        def always_fails():
            self.counter += 1
            raise exceptions.RequestTimeout()
        self.assertRaises(exceptions.RequestTimeout, always_fails)
        self.assertEqual(self.counter, retries + 1)
        self.assertEqual(self.sleep_fn.call_count, retries)
        expected_sleep_fn_arg = []
        for i in range(retries):
            expected_sleep_fn_arg.append(interval)
            interval *= backoff_rate
        self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
    def test_limit_is_reached_with_conf(self):
        self.counter = 0
        @RetryDecorator(exceptions.RequestTimeout)
        def always_fails():
            self.counter += 1
            raise exceptions.RequestTimeout()
        retry = cfg.CONF.ESWITCH.retries
        interval = cfg.CONF.ESWITCH.request_timeout / 1000
        delay_rate = cfg.CONF.ESWITCH.backoff_rate
        expected_sleep_fn_arg = []
        for i in range(retry):
            expected_sleep_fn_arg.append(interval)
            interval *= delay_rate
        self.assertRaises(exceptions.RequestTimeout, always_fails)
        self.assertEqual(self.counter, retry + 1)
        self.assertEqual(self.sleep_fn.call_count, retry)
        self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
    def test_wrong_exception_no_retry(self):
        @RetryDecorator(exceptions.RequestTimeout)
        def raise_unexpected_error():
            raise WrongException("wrong exception")
        self.assertRaises(WrongException, raise_unexpected_error)
        self.assertFalse(self.sleep_fn.called)