Merge "Add request timeout handling for Mellanox Neutron Agent"
This commit is contained in:
commit
54a9f31872
@ -34,12 +34,20 @@
|
|||||||
# vnic_type = mlnx_direct
|
# vnic_type = mlnx_direct
|
||||||
|
|
||||||
# (StrOpt) Eswitch daemon end point connection url
|
# (StrOpt) Eswitch daemon end point connection url
|
||||||
# daemon_endpoint = 'tcp://127.0.0.1:5001'
|
# daemon_endpoint = 'tcp://127.0.0.1:60001'
|
||||||
|
|
||||||
# The number of milliseconds the agent will wait for
|
# The number of milliseconds the agent will wait for
|
||||||
# response on request to daemon
|
# response on request to daemon
|
||||||
# request_timeout = 3000
|
# request_timeout = 3000
|
||||||
|
|
||||||
|
# The number of retries the agent will send request
|
||||||
|
# to daemon before giving up
|
||||||
|
# retries = 3
|
||||||
|
|
||||||
|
# The backoff rate multiplier for waiting period between retries
|
||||||
|
# on request to daemon, i.e. value of 2 will double
|
||||||
|
# the request timeout each retry
|
||||||
|
# backoff_rate = 2
|
||||||
|
|
||||||
[agent]
|
[agent]
|
||||||
# Agent's polling interval in seconds
|
# Agent's polling interval in seconds
|
||||||
|
@ -392,6 +392,10 @@ class MlnxEswitchNeutronAgent(sg_rpc.SecurityGroupAgentRpcMixin):
|
|||||||
# If treat devices fails - must resync with plugin
|
# If treat devices fails - must resync with plugin
|
||||||
sync = self.process_network_ports(port_info)
|
sync = self.process_network_ports(port_info)
|
||||||
ports = port_info['current']
|
ports = port_info['current']
|
||||||
|
except exceptions.RequestTimeout:
|
||||||
|
LOG.exception(_("Request timeout in agent event loop "
|
||||||
|
"eSwitchD is not responding - exiting..."))
|
||||||
|
raise SystemExit(1)
|
||||||
except Exception:
|
except Exception:
|
||||||
LOG.exception(_("Error in agent event loop"))
|
LOG.exception(_("Error in agent event loop"))
|
||||||
sync = True
|
sync = True
|
||||||
|
@ -19,6 +19,7 @@ import zmq
|
|||||||
|
|
||||||
from neutron.openstack.common import jsonutils
|
from neutron.openstack.common import jsonutils
|
||||||
from neutron.openstack.common import log as logging
|
from neutron.openstack.common import log as logging
|
||||||
|
from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
|
||||||
from neutron.plugins.mlnx.common import exceptions
|
from neutron.plugins.mlnx.common import exceptions
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
@ -42,6 +43,7 @@ class EswitchUtils(object):
|
|||||||
self.poller.register(self._conn, zmq.POLLIN)
|
self.poller.register(self._conn, zmq.POLLIN)
|
||||||
return self.__conn
|
return self.__conn
|
||||||
|
|
||||||
|
@RetryDecorator(exceptions.RequestTimeout)
|
||||||
def send_msg(self, msg):
|
def send_msg(self, msg):
|
||||||
self._conn.send(msg)
|
self._conn.send(msg)
|
||||||
|
|
||||||
@ -55,7 +57,7 @@ class EswitchUtils(object):
|
|||||||
self._conn.close()
|
self._conn.close()
|
||||||
self.poller.unregister(self._conn)
|
self.poller.unregister(self._conn)
|
||||||
self.__conn = None
|
self.__conn = None
|
||||||
raise exceptions.MlnxException(_("eSwitchD: Request timeout"))
|
raise exceptions.RequestTimeout()
|
||||||
|
|
||||||
def parse_response_msg(self, recv_msg):
|
def parse_response_msg(self, recv_msg):
|
||||||
msg = jsonutils.loads(recv_msg)
|
msg = jsonutils.loads(recv_msg)
|
||||||
@ -69,7 +71,7 @@ class EswitchUtils(object):
|
|||||||
else:
|
else:
|
||||||
error_msg = _("Unknown operation status %s") % msg['status']
|
error_msg = _("Unknown operation status %s") % msg['status']
|
||||||
LOG.error(error_msg)
|
LOG.error(error_msg)
|
||||||
raise exceptions.MlnxException(error_msg)
|
raise exceptions.OperationFailed(err_msg=error_msg)
|
||||||
|
|
||||||
def get_attached_vnics(self):
|
def get_attached_vnics(self):
|
||||||
LOG.debug(_("get_attached_vnics"))
|
LOG.debug(_("get_attached_vnics"))
|
||||||
|
66
neutron/plugins/mlnx/common/comm_utils.py
Normal file
66
neutron/plugins/mlnx/common/comm_utils.py
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
# vim: tabstop=4 shiftwidth=4 softtabstop=4
|
||||||
|
#
|
||||||
|
# Copyright 2013 Mellanox Technologies, Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
# implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
from oslo.config import cfg
|
||||||
|
|
||||||
|
from neutron.openstack.common import log as logging
|
||||||
|
from neutron.plugins.mlnx.common import config # noqa
|
||||||
|
|
||||||
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class RetryDecorator(object):
|
||||||
|
"""Retry decorator reruns a method 'retries' times if an exception occurs.
|
||||||
|
|
||||||
|
Decorator for retrying a method if exceptionToCheck exception occurs
|
||||||
|
If method raises exception, retries 'retries' times with increasing
|
||||||
|
back off period between calls with 'interval' multiplier
|
||||||
|
|
||||||
|
:param exceptionToCheck: the exception to check
|
||||||
|
:param interval: initial delay between retries in seconds
|
||||||
|
:param retries: number of times to try before giving up
|
||||||
|
:raises: exceptionToCheck
|
||||||
|
"""
|
||||||
|
sleep_fn = time.sleep
|
||||||
|
|
||||||
|
def __init__(self, exceptionToCheck,
|
||||||
|
interval=cfg.CONF.ESWITCH.request_timeout / 1000,
|
||||||
|
retries=cfg.CONF.ESWITCH.retries,
|
||||||
|
backoff_rate=cfg.CONF.ESWITCH.backoff_rate):
|
||||||
|
self.exc = exceptionToCheck
|
||||||
|
self.interval = interval
|
||||||
|
self.retries = retries
|
||||||
|
self.backoff_rate = backoff_rate
|
||||||
|
|
||||||
|
def __call__(self, original_func):
|
||||||
|
def decorated(*args, **kwargs):
|
||||||
|
sleep_interval = self.interval
|
||||||
|
num_of_iter = self.retries
|
||||||
|
while num_of_iter > 0:
|
||||||
|
try:
|
||||||
|
return original_func(*args, **kwargs)
|
||||||
|
except self.exc:
|
||||||
|
LOG.debug(_("Request timeout - call again after "
|
||||||
|
"%s seconds"), sleep_interval)
|
||||||
|
RetryDecorator.sleep_fn(sleep_interval)
|
||||||
|
num_of_iter -= 1
|
||||||
|
sleep_interval *= self.backoff_rate
|
||||||
|
|
||||||
|
return original_func(*args, **kwargs)
|
||||||
|
return decorated
|
@ -48,6 +48,13 @@ eswitch_opts = [
|
|||||||
cfg.IntOpt('request_timeout', default=3000,
|
cfg.IntOpt('request_timeout', default=3000,
|
||||||
help=_("The number of milliseconds the agent will wait for "
|
help=_("The number of milliseconds the agent will wait for "
|
||||||
"response on request to daemon.")),
|
"response on request to daemon.")),
|
||||||
|
cfg.IntOpt('retries', default=3,
|
||||||
|
help=_("The number of retries the agent will send request "
|
||||||
|
"to daemon before giving up")),
|
||||||
|
cfg.IntOpt('backoff_rate', default=2,
|
||||||
|
help=_("backoff rate multiplier for waiting period between "
|
||||||
|
"retries for request to daemon, i.e. value of 2 will "
|
||||||
|
" double the request timeout each retry")),
|
||||||
]
|
]
|
||||||
|
|
||||||
agent_opts = [
|
agent_opts = [
|
||||||
|
@ -20,3 +20,11 @@ from neutron.common import exceptions as qexc
|
|||||||
|
|
||||||
class MlnxException(qexc.NeutronException):
|
class MlnxException(qexc.NeutronException):
|
||||||
message = _("Mlnx Exception: %(err_msg)s")
|
message = _("Mlnx Exception: %(err_msg)s")
|
||||||
|
|
||||||
|
|
||||||
|
class RequestTimeout(qexc.NeutronException):
|
||||||
|
message = _("Request Timeout: no response from eSwitchD")
|
||||||
|
|
||||||
|
|
||||||
|
class OperationFailed(qexc.NeutronException):
|
||||||
|
message = _("Operation Failed: %(err_msg)s")
|
||||||
|
139
neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
Normal file
139
neutron/tests/unit/mlnx/test_mlnx_comm_utils.py
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
# Copyright (c) 2013 OpenStack Foundation
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
# implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import mock
|
||||||
|
from oslo.config import cfg
|
||||||
|
|
||||||
|
from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
|
||||||
|
from neutron.plugins.mlnx.common import config # noqa
|
||||||
|
from neutron.plugins.mlnx.common import exceptions
|
||||||
|
from neutron.tests import base
|
||||||
|
|
||||||
|
|
||||||
|
class WrongException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class TestRetryDecorator(base.BaseTestCase):
|
||||||
|
def setUp(self):
|
||||||
|
super(TestRetryDecorator, self).setUp()
|
||||||
|
self.sleep_fn_p = mock.patch.object(RetryDecorator, 'sleep_fn')
|
||||||
|
self.sleep_fn = self.sleep_fn_p.start()
|
||||||
|
self.addCleanup(self.sleep_fn_p.stop)
|
||||||
|
|
||||||
|
def test_no_retry_required(self):
|
||||||
|
self.counter = 0
|
||||||
|
|
||||||
|
@RetryDecorator(exceptions.RequestTimeout, interval=2,
|
||||||
|
retries=3, backoff_rate=2)
|
||||||
|
def succeeds():
|
||||||
|
self.counter += 1
|
||||||
|
return 'success'
|
||||||
|
|
||||||
|
ret = succeeds()
|
||||||
|
self.assertFalse(self.sleep_fn.called)
|
||||||
|
self.assertEqual(ret, 'success')
|
||||||
|
self.assertEqual(self.counter, 1)
|
||||||
|
|
||||||
|
def test_retry_zero_times(self):
|
||||||
|
self.counter = 0
|
||||||
|
interval = 2
|
||||||
|
backoff_rate = 2
|
||||||
|
retries = 0
|
||||||
|
|
||||||
|
@RetryDecorator(exceptions.RequestTimeout, interval,
|
||||||
|
retries, backoff_rate)
|
||||||
|
def always_fails():
|
||||||
|
self.counter += 1
|
||||||
|
raise exceptions.RequestTimeout()
|
||||||
|
|
||||||
|
self.assertRaises(exceptions.RequestTimeout, always_fails)
|
||||||
|
self.assertEqual(self.counter, 1)
|
||||||
|
self.assertFalse(self.sleep_fn.called)
|
||||||
|
|
||||||
|
def test_retries_once(self):
|
||||||
|
self.counter = 0
|
||||||
|
interval = 2
|
||||||
|
backoff_rate = 2
|
||||||
|
retries = 3
|
||||||
|
|
||||||
|
@RetryDecorator(exceptions.RequestTimeout, interval,
|
||||||
|
retries, backoff_rate)
|
||||||
|
def fails_once():
|
||||||
|
self.counter += 1
|
||||||
|
if self.counter < 2:
|
||||||
|
raise exceptions.RequestTimeout()
|
||||||
|
else:
|
||||||
|
return 'success'
|
||||||
|
|
||||||
|
ret = fails_once()
|
||||||
|
self.assertEqual(ret, 'success')
|
||||||
|
self.assertEqual(self.counter, 2)
|
||||||
|
self.assertEqual(self.sleep_fn.call_count, 1)
|
||||||
|
self.sleep_fn.assert_called_with(interval)
|
||||||
|
|
||||||
|
def test_limit_is_reached(self):
|
||||||
|
self.counter = 0
|
||||||
|
retries = 3
|
||||||
|
interval = 2
|
||||||
|
backoff_rate = 4
|
||||||
|
|
||||||
|
@RetryDecorator(exceptions.RequestTimeout, interval,
|
||||||
|
retries, backoff_rate)
|
||||||
|
def always_fails():
|
||||||
|
self.counter += 1
|
||||||
|
raise exceptions.RequestTimeout()
|
||||||
|
|
||||||
|
self.assertRaises(exceptions.RequestTimeout, always_fails)
|
||||||
|
self.assertEqual(self.counter, retries + 1)
|
||||||
|
self.assertEqual(self.sleep_fn.call_count, retries)
|
||||||
|
|
||||||
|
expected_sleep_fn_arg = []
|
||||||
|
for i in range(retries):
|
||||||
|
expected_sleep_fn_arg.append(interval)
|
||||||
|
interval *= backoff_rate
|
||||||
|
|
||||||
|
self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
|
||||||
|
|
||||||
|
def test_limit_is_reached_with_conf(self):
|
||||||
|
self.counter = 0
|
||||||
|
|
||||||
|
@RetryDecorator(exceptions.RequestTimeout)
|
||||||
|
def always_fails():
|
||||||
|
self.counter += 1
|
||||||
|
raise exceptions.RequestTimeout()
|
||||||
|
|
||||||
|
retry = cfg.CONF.ESWITCH.retries
|
||||||
|
interval = cfg.CONF.ESWITCH.request_timeout / 1000
|
||||||
|
delay_rate = cfg.CONF.ESWITCH.backoff_rate
|
||||||
|
|
||||||
|
expected_sleep_fn_arg = []
|
||||||
|
for i in range(retry):
|
||||||
|
expected_sleep_fn_arg.append(interval)
|
||||||
|
interval *= delay_rate
|
||||||
|
|
||||||
|
self.assertRaises(exceptions.RequestTimeout, always_fails)
|
||||||
|
self.assertEqual(self.counter, retry + 1)
|
||||||
|
self.assertEqual(self.sleep_fn.call_count, retry)
|
||||||
|
self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
|
||||||
|
|
||||||
|
def test_wrong_exception_no_retry(self):
|
||||||
|
|
||||||
|
@RetryDecorator(exceptions.RequestTimeout)
|
||||||
|
def raise_unexpected_error():
|
||||||
|
raise WrongException("wrong exception")
|
||||||
|
|
||||||
|
self.assertRaises(WrongException, raise_unexpected_error)
|
||||||
|
self.assertFalse(self.sleep_fn.called)
|
Loading…
Reference in New Issue
Block a user