Merge "Add request timeout handling for Mellanox Neutron Agent"
This commit is contained in:
commit
54a9f31872
@ -34,12 +34,20 @@
|
||||
# vnic_type = mlnx_direct
|
||||
|
||||
# (StrOpt) Eswitch daemon end point connection url
|
||||
# daemon_endpoint = 'tcp://127.0.0.1:5001'
|
||||
# daemon_endpoint = 'tcp://127.0.0.1:60001'
|
||||
|
||||
# The number of milliseconds the agent will wait for
|
||||
# response on request to daemon
|
||||
# request_timeout = 3000
|
||||
|
||||
# The number of retries the agent will send request
|
||||
# to daemon before giving up
|
||||
# retries = 3
|
||||
|
||||
# The backoff rate multiplier for waiting period between retries
|
||||
# on request to daemon, i.e. value of 2 will double
|
||||
# the request timeout each retry
|
||||
# backoff_rate = 2
|
||||
|
||||
[agent]
|
||||
# Agent's polling interval in seconds
|
||||
|
@ -392,6 +392,10 @@ class MlnxEswitchNeutronAgent(sg_rpc.SecurityGroupAgentRpcMixin):
|
||||
# If treat devices fails - must resync with plugin
|
||||
sync = self.process_network_ports(port_info)
|
||||
ports = port_info['current']
|
||||
except exceptions.RequestTimeout:
|
||||
LOG.exception(_("Request timeout in agent event loop "
|
||||
"eSwitchD is not responding - exiting..."))
|
||||
raise SystemExit(1)
|
||||
except Exception:
|
||||
LOG.exception(_("Error in agent event loop"))
|
||||
sync = True
|
||||
|
@ -19,6 +19,7 @@ import zmq
|
||||
|
||||
from neutron.openstack.common import jsonutils
|
||||
from neutron.openstack.common import log as logging
|
||||
from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
|
||||
from neutron.plugins.mlnx.common import exceptions
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
@ -42,6 +43,7 @@ class EswitchUtils(object):
|
||||
self.poller.register(self._conn, zmq.POLLIN)
|
||||
return self.__conn
|
||||
|
||||
@RetryDecorator(exceptions.RequestTimeout)
|
||||
def send_msg(self, msg):
|
||||
self._conn.send(msg)
|
||||
|
||||
@ -55,7 +57,7 @@ class EswitchUtils(object):
|
||||
self._conn.close()
|
||||
self.poller.unregister(self._conn)
|
||||
self.__conn = None
|
||||
raise exceptions.MlnxException(_("eSwitchD: Request timeout"))
|
||||
raise exceptions.RequestTimeout()
|
||||
|
||||
def parse_response_msg(self, recv_msg):
|
||||
msg = jsonutils.loads(recv_msg)
|
||||
@ -69,7 +71,7 @@ class EswitchUtils(object):
|
||||
else:
|
||||
error_msg = _("Unknown operation status %s") % msg['status']
|
||||
LOG.error(error_msg)
|
||||
raise exceptions.MlnxException(error_msg)
|
||||
raise exceptions.OperationFailed(err_msg=error_msg)
|
||||
|
||||
def get_attached_vnics(self):
|
||||
LOG.debug(_("get_attached_vnics"))
|
||||
|
66
neutron/plugins/mlnx/common/comm_utils.py
Normal file
66
neutron/plugins/mlnx/common/comm_utils.py
Normal file
@ -0,0 +1,66 @@
|
||||
# vim: tabstop=4 shiftwidth=4 softtabstop=4
|
||||
#
|
||||
# Copyright 2013 Mellanox Technologies, Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
# implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
|
||||
from oslo.config import cfg
|
||||
|
||||
from neutron.openstack.common import log as logging
|
||||
from neutron.plugins.mlnx.common import config # noqa
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RetryDecorator(object):
|
||||
"""Retry decorator reruns a method 'retries' times if an exception occurs.
|
||||
|
||||
Decorator for retrying a method if exceptionToCheck exception occurs
|
||||
If method raises exception, retries 'retries' times with increasing
|
||||
back off period between calls with 'interval' multiplier
|
||||
|
||||
:param exceptionToCheck: the exception to check
|
||||
:param interval: initial delay between retries in seconds
|
||||
:param retries: number of times to try before giving up
|
||||
:raises: exceptionToCheck
|
||||
"""
|
||||
sleep_fn = time.sleep
|
||||
|
||||
def __init__(self, exceptionToCheck,
|
||||
interval=cfg.CONF.ESWITCH.request_timeout / 1000,
|
||||
retries=cfg.CONF.ESWITCH.retries,
|
||||
backoff_rate=cfg.CONF.ESWITCH.backoff_rate):
|
||||
self.exc = exceptionToCheck
|
||||
self.interval = interval
|
||||
self.retries = retries
|
||||
self.backoff_rate = backoff_rate
|
||||
|
||||
def __call__(self, original_func):
|
||||
def decorated(*args, **kwargs):
|
||||
sleep_interval = self.interval
|
||||
num_of_iter = self.retries
|
||||
while num_of_iter > 0:
|
||||
try:
|
||||
return original_func(*args, **kwargs)
|
||||
except self.exc:
|
||||
LOG.debug(_("Request timeout - call again after "
|
||||
"%s seconds"), sleep_interval)
|
||||
RetryDecorator.sleep_fn(sleep_interval)
|
||||
num_of_iter -= 1
|
||||
sleep_interval *= self.backoff_rate
|
||||
|
||||
return original_func(*args, **kwargs)
|
||||
return decorated
|
@ -48,6 +48,13 @@ eswitch_opts = [
|
||||
cfg.IntOpt('request_timeout', default=3000,
|
||||
help=_("The number of milliseconds the agent will wait for "
|
||||
"response on request to daemon.")),
|
||||
cfg.IntOpt('retries', default=3,
|
||||
help=_("The number of retries the agent will send request "
|
||||
"to daemon before giving up")),
|
||||
cfg.IntOpt('backoff_rate', default=2,
|
||||
help=_("backoff rate multiplier for waiting period between "
|
||||
"retries for request to daemon, i.e. value of 2 will "
|
||||
" double the request timeout each retry")),
|
||||
]
|
||||
|
||||
agent_opts = [
|
||||
|
@ -20,3 +20,11 @@ from neutron.common import exceptions as qexc
|
||||
|
||||
class MlnxException(qexc.NeutronException):
|
||||
message = _("Mlnx Exception: %(err_msg)s")
|
||||
|
||||
|
||||
class RequestTimeout(qexc.NeutronException):
|
||||
message = _("Request Timeout: no response from eSwitchD")
|
||||
|
||||
|
||||
class OperationFailed(qexc.NeutronException):
|
||||
message = _("Operation Failed: %(err_msg)s")
|
||||
|
139
neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
Normal file
139
neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
Normal file
@ -0,0 +1,139 @@
|
||||
# Copyright (c) 2013 OpenStack Foundation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
# implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import mock
|
||||
from oslo.config import cfg
|
||||
|
||||
from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
|
||||
from neutron.plugins.mlnx.common import config # noqa
|
||||
from neutron.plugins.mlnx.common import exceptions
|
||||
from neutron.tests import base
|
||||
|
||||
|
||||
class WrongException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class TestRetryDecorator(base.BaseTestCase):
|
||||
def setUp(self):
|
||||
super(TestRetryDecorator, self).setUp()
|
||||
self.sleep_fn_p = mock.patch.object(RetryDecorator, 'sleep_fn')
|
||||
self.sleep_fn = self.sleep_fn_p.start()
|
||||
self.addCleanup(self.sleep_fn_p.stop)
|
||||
|
||||
def test_no_retry_required(self):
|
||||
self.counter = 0
|
||||
|
||||
@RetryDecorator(exceptions.RequestTimeout, interval=2,
|
||||
retries=3, backoff_rate=2)
|
||||
def succeeds():
|
||||
self.counter += 1
|
||||
return 'success'
|
||||
|
||||
ret = succeeds()
|
||||
self.assertFalse(self.sleep_fn.called)
|
||||
self.assertEqual(ret, 'success')
|
||||
self.assertEqual(self.counter, 1)
|
||||
|
||||
def test_retry_zero_times(self):
|
||||
self.counter = 0
|
||||
interval = 2
|
||||
backoff_rate = 2
|
||||
retries = 0
|
||||
|
||||
@RetryDecorator(exceptions.RequestTimeout, interval,
|
||||
retries, backoff_rate)
|
||||
def always_fails():
|
||||
self.counter += 1
|
||||
raise exceptions.RequestTimeout()
|
||||
|
||||
self.assertRaises(exceptions.RequestTimeout, always_fails)
|
||||
self.assertEqual(self.counter, 1)
|
||||
self.assertFalse(self.sleep_fn.called)
|
||||
|
||||
def test_retries_once(self):
|
||||
self.counter = 0
|
||||
interval = 2
|
||||
backoff_rate = 2
|
||||
retries = 3
|
||||
|
||||
@RetryDecorator(exceptions.RequestTimeout, interval,
|
||||
retries, backoff_rate)
|
||||
def fails_once():
|
||||
self.counter += 1
|
||||
if self.counter < 2:
|
||||
raise exceptions.RequestTimeout()
|
||||
else:
|
||||
return 'success'
|
||||
|
||||
ret = fails_once()
|
||||
self.assertEqual(ret, 'success')
|
||||
self.assertEqual(self.counter, 2)
|
||||
self.assertEqual(self.sleep_fn.call_count, 1)
|
||||
self.sleep_fn.assert_called_with(interval)
|
||||
|
||||
def test_limit_is_reached(self):
|
||||
self.counter = 0
|
||||
retries = 3
|
||||
interval = 2
|
||||
backoff_rate = 4
|
||||
|
||||
@RetryDecorator(exceptions.RequestTimeout, interval,
|
||||
retries, backoff_rate)
|
||||
def always_fails():
|
||||
self.counter += 1
|
||||
raise exceptions.RequestTimeout()
|
||||
|
||||
self.assertRaises(exceptions.RequestTimeout, always_fails)
|
||||
self.assertEqual(self.counter, retries + 1)
|
||||
self.assertEqual(self.sleep_fn.call_count, retries)
|
||||
|
||||
expected_sleep_fn_arg = []
|
||||
for i in range(retries):
|
||||
expected_sleep_fn_arg.append(interval)
|
||||
interval *= backoff_rate
|
||||
|
||||
self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
|
||||
|
||||
def test_limit_is_reached_with_conf(self):
|
||||
self.counter = 0
|
||||
|
||||
@RetryDecorator(exceptions.RequestTimeout)
|
||||
def always_fails():
|
||||
self.counter += 1
|
||||
raise exceptions.RequestTimeout()
|
||||
|
||||
retry = cfg.CONF.ESWITCH.retries
|
||||
interval = cfg.CONF.ESWITCH.request_timeout / 1000
|
||||
delay_rate = cfg.CONF.ESWITCH.backoff_rate
|
||||
|
||||
expected_sleep_fn_arg = []
|
||||
for i in range(retry):
|
||||
expected_sleep_fn_arg.append(interval)
|
||||
interval *= delay_rate
|
||||
|
||||
self.assertRaises(exceptions.RequestTimeout, always_fails)
|
||||
self.assertEqual(self.counter, retry + 1)
|
||||
self.assertEqual(self.sleep_fn.call_count, retry)
|
||||
self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
|
||||
|
||||
def test_wrong_exception_no_retry(self):
|
||||
|
||||
@RetryDecorator(exceptions.RequestTimeout)
|
||||
def raise_unexpected_error():
|
||||
raise WrongException("wrong exception")
|
||||
|
||||
self.assertRaises(WrongException, raise_unexpected_error)
|
||||
self.assertFalse(self.sleep_fn.called)
|
Loading…
Reference in New Issue
Block a user