From e7acdfe91ae1e539fa89de4e161d06dde5ede427 Mon Sep 17 00:00:00 2001
From: Boden R
Date: Wed, 3 Feb 2016 14:39:27 -0700
Subject: [PATCH] NSX-v3 update endpoint state only on timeout

This patch removes the NSX v3 client cluster logic that forces a
revalidation of all endpoints when endpoint selection finds only DOWN
endpoints. The revalidate call can cause cascading backpressure under
certain circumstances. Now DOWN endpoints are only returned to UP as
part of the endpoint keepalive ping, which is controlled via the
conn_idle_timeout config property. The default conn_idle_timeout is
therefore also decreased to 10s, ensuring endpoint revalidation occurs
(by default) on a frequent basis.

backport: liberty

Change-Id: I5423bce793892dd864353a23ca7c288b846a1ab6
Closes-Bug: #1541591
---
 etc/nsx.ini                                 |  2 +-
 vmware_nsx/common/config.py                 |  2 +-
 vmware_nsx/nsxlib/v3/cluster.py             | 17 ++++-------------
 vmware_nsx/tests/unit/nsx_v3/test_plugin.py |  2 --
 4 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/etc/nsx.ini b/etc/nsx.ini
index 5159d271bb..053720cadf 100644
--- a/etc/nsx.ini
+++ b/etc/nsx.ini
@@ -369,7 +369,7 @@
 
 # The amount of time in seconds to wait before ensuring connectivity to
 # the NSX manager if no manager connection has been used.
-# conn_idle_timeout = 60
+# conn_idle_timeout = 10
 
 # UUID of the default tier0 router that will be used for connecting to
 # tier1 logical routers and configuring external networks
diff --git a/vmware_nsx/common/config.py b/vmware_nsx/common/config.py
index 6dd7989a21..9fd1772f28 100644
--- a/vmware_nsx/common/config.py
+++ b/vmware_nsx/common/config.py
@@ -232,7 +232,7 @@ nsx_v3_opts = [
                help=_("Maximum concurrent connections to each NSX "
                       "manager.")),
     cfg.IntOpt('conn_idle_timeout',
-               default=60,
+               default=10,
               help=_('Ensure connectivity to the NSX manager if a connection '
                      'is not used within timeout seconds.')),
     cfg.IntOpt('redirects',
diff --git a/vmware_nsx/nsxlib/v3/cluster.py b/vmware_nsx/nsxlib/v3/cluster.py
index ec52c3c6e2..988861b655 100644
--- a/vmware_nsx/nsxlib/v3/cluster.py
+++ b/vmware_nsx/nsxlib/v3/cluster.py
@@ -326,11 +326,6 @@ class ClusteredAPI(object):
                 if up == len(self._endpoints)
                 else ClusterHealth.ORANGE)
 
-    def revalidate_endpoints(self):
-        # validate each endpoint in serial
-        for endpoint in self._endpoints.values():
-            self._validate(endpoint)
-
     def _validate(self, endpoint):
         try:
             with endpoint.pool.item() as conn:
@@ -343,7 +338,7 @@ class ClusteredAPI(object):
                             "'%(ep)s' due to: %(err)s"),
                         {'ep': endpoint, 'err': e})
 
-    def _select_endpoint(self, revalidate=False):
+    def _select_endpoint(self):
         connected = {}
         for provider_id, endpoint in self._endpoints.items():
             if endpoint.state == EndpointState.UP:
@@ -352,12 +347,6 @@ class ClusteredAPI(object):
                     # connection can be used now
                     return endpoint
 
-        if not connected and revalidate:
-            LOG.debug("All endpoints DOWN; revalidating.")
-            # endpoints may have become available, try to revalidate
-            self.revalidate_endpoints()
-            return self._select_endpoint(revalidate=False)
-
         # no free connections; randomly select a connected endpoint
         # which will likely wait on pool.item() until a connection frees up
         return (connected[random.choice(connected.keys())]
@@ -382,8 +371,10 @@ class ClusteredAPI(object):
 
     @contextlib.contextmanager
     def endpoint_connection(self):
-        endpoint = self._select_endpoint(revalidate=True)
+        endpoint = self._select_endpoint()
         if not endpoint:
+            # all endpoints are DOWN and will have their next
+            # state updated as per _endpoint_keepalive()
             raise nsx_exc.ServiceClusterUnavailable(
                 cluster_id=self.cluster_id)
 
diff --git a/vmware_nsx/tests/unit/nsx_v3/test_plugin.py b/vmware_nsx/tests/unit/nsx_v3/test_plugin.py
index 376b5de2f3..d42434d27e 100644
--- a/vmware_nsx/tests/unit/nsx_v3/test_plugin.py
+++ b/vmware_nsx/tests/unit/nsx_v3/test_plugin.py
@@ -71,8 +71,6 @@ class NsxV3PluginTestCaseMixin(test_plugin.NeutronDbPluginV2TestCase,
         self.cluster = nsx_cluster.NSXClusteredAPI(
             http_provider=nsxlib_testcase.MemoryMockAPIProvider(self.mock_api))
 
-        self.cluster.revalidate_endpoints()
-
         def _patch_object(*args, **kwargs):
            patcher = mock.patch.object(*args, **kwargs)
            patcher.start()
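
For context, a minimal sketch of the keepalive pattern this patch now
relies on exclusively, written with hypothetical names
(endpoint_keepalive, validate); it illustrates the timer-driven
revalidation referenced by _endpoint_keepalive() in the diff above, not
the actual vmware_nsx implementation:

    # Illustrative sketch only -- hypothetical names, not vmware_nsx code.
    # A timer loop pings every endpoint each conn_idle_timeout seconds;
    # after this patch, this loop is the only path that transitions DOWN
    # endpoints back to UP.
    import time


    class EndpointState(object):
        UP = 'UP'
        DOWN = 'DOWN'


    def endpoint_keepalive(endpoints, validate, conn_idle_timeout=10):
        while True:
            time.sleep(conn_idle_timeout)
            for endpoint in endpoints:
                # validate() pings the NSX manager and sets
                # endpoint.state to UP on success or DOWN on failure
                validate(endpoint)

With the default conn_idle_timeout lowered from 60s to 10s, a cluster
whose endpoints are all DOWN surfaces ServiceClusterUnavailable for at
most roughly one keepalive interval before the endpoints are retried.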
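Likewise, the post-patch selection logic reads as the following sketch
(hypothetical standalone code mirroring _select_endpoint; it reuses
EndpointState from the sketch above): with the revalidate branch gone,
an all-DOWN cluster yields None immediately rather than forcing serial
revalidation on the request path.

    import random


    def select_endpoint(endpoints):
        # endpoints: dict mapping provider_id -> endpoint
        connected = {}
        for provider_id, endpoint in endpoints.items():
            if endpoint.state == EndpointState.UP:
                connected[provider_id] = endpoint
                if endpoint.pool.free():
                    # a free connection is available right now
                    return endpoint
        if not connected:
            # all endpoints DOWN; no revalidation here -- the keepalive
            # loop is responsible for restoring endpoint state
            return None
        # no free connections; pick a connected endpoint at random and
        # let the caller block on its pool until a connection frees up
        return connected[random.choice(list(connected.keys()))]

Note the list() around connected.keys(): random.choice() cannot index a
dict view on Python 3, so the sketch diverges from the diff's
connected.keys() expression on that one point.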