NSX-v3: update endpoint state only on timeout
This patch removes the NSX v3 client cluster logic that forced a revalidation of all endpoints whenever endpoint selection found only DOWN endpoints. That revalidate call could cause cascading backpressure under certain circumstances. DOWN endpoints are now returned to UP only by the endpoint keepalive ping, which is controlled via the conn_idle_timeout config property. Accordingly, the default conn_idle_timeout is decreased to 10s, ensuring endpoint revalidation occurs (by default) on a frequent basis.

backport: liberty

Change-Id: I5423bce793892dd864353a23ca7c288b846a1ab6
Closes-Bug: #1541591
parent 772e43f576
commit e7acdfe91a
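For context, the keepalive mechanism the message refers to works roughly as follows. This is a minimal sketch only, assuming a background loop and a validate helper like the _validate() shown in the diff below; the attribute names (cluster.endpoints, endpoint.last_used, cluster.validate) are hypothetical stand-ins, not the actual vmware-nsx implementation:

    # Minimal sketch -- NOT the real _endpoint_keepalive() body.
    # Assumes: cluster.endpoints is iterable, endpoint.last_used is a
    # timestamp, and cluster.validate() pings one endpoint and flips its
    # state to UP or DOWN based on the result (all hypothetical names).
    import time

    def endpoint_keepalive(cluster, conn_idle_timeout=10):
        while True:
            for endpoint in cluster.endpoints:
                if time.time() - endpoint.last_used >= conn_idle_timeout:
                    # after this patch, the keepalive ping is the only
                    # path that moves a DOWN endpoint back to UP
                    cluster.validate(endpoint)
            time.sleep(conn_idle_timeout)

With the default lowered from 60s to 10s, a cluster whose endpoints all went DOWN can recover within roughly one idle-timeout interval instead of up to a minute.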
@@ -369,7 +369,7 @@
 
 # The amount of time in seconds to wait before ensuring connectivity to
 # the NSX manager if no manager connection has been used.
-# conn_idle_timeout = 60
+# conn_idle_timeout = 10
 
 # UUID of the default tier0 router that will be used for connecting to
 # tier1 logical routers and configuring external networks
@@ -232,7 +232,7 @@ nsx_v3_opts = [
                help=_("Maximum concurrent connections to each NSX "
                       "manager.")),
     cfg.IntOpt('conn_idle_timeout',
-               default=60,
+               default=10,
                help=_('Ensure connectivity to the NSX manager if a connection '
                       'is not used within timeout seconds.')),
     cfg.IntOpt('redirects',
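For readers unfamiliar with oslo.config, the changed default is consumed like any other IntOpt. A minimal sketch, assuming the options are registered under an 'nsx_v3' group (the group name is an assumption, not taken from the diff):

    from oslo_config import cfg

    # stand-in for the nsx_v3_opts list in the hunk above, reduced to
    # the single option this patch changes
    nsx_v3_opts = [
        cfg.IntOpt('conn_idle_timeout',
                   default=10,
                   help='Ensure connectivity to the NSX manager if a '
                        'connection is not used within timeout seconds.'),
    ]

    cfg.CONF.register_opts(nsx_v3_opts, group='nsx_v3')  # group name assumed
    print(cfg.CONF.nsx_v3.conn_idle_timeout)  # -> 10

Operators who preferred the old 60s cadence can simply set conn_idle_timeout back up in their configuration file.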
@@ -326,11 +326,6 @@ class ClusteredAPI(object):
                 if up == len(self._endpoints)
                 else ClusterHealth.ORANGE)
 
-    def revalidate_endpoints(self):
-        # validate each endpoint in serial
-        for endpoint in self._endpoints.values():
-            self._validate(endpoint)
-
     def _validate(self, endpoint):
         try:
             with endpoint.pool.item() as conn:
@@ -343,7 +338,7 @@ class ClusteredAPI(object):
                       "'%(ep)s' due to: %(err)s"),
                   {'ep': endpoint, 'err': e})
 
-    def _select_endpoint(self, revalidate=False):
+    def _select_endpoint(self):
         connected = {}
         for provider_id, endpoint in self._endpoints.items():
             if endpoint.state == EndpointState.UP:
@@ -352,12 +347,6 @@
                     # connection can be used now
                     return endpoint
 
-        if not connected and revalidate:
-            LOG.debug("All endpoints DOWN; revalidating.")
-            # endpoints may have become available, try to revalidate
-            self.revalidate_endpoints()
-            return self._select_endpoint(revalidate=False)
-
         # no free connections; randomly select a connected endpoint
         # which will likely wait on pool.item() until a connection frees up
         return (connected[random.choice(connected.keys())]
@@ -382,8 +371,10 @@
 
     @contextlib.contextmanager
     def endpoint_connection(self):
-        endpoint = self._select_endpoint(revalidate=True)
+        endpoint = self._select_endpoint()
         if not endpoint:
+            # all endpoints are DOWN and will have their next
+            # state updated as per _endpoint_keepalive()
             raise nsx_exc.ServiceClusterUnavailable(
                 cluster_id=self.cluster_id)
 
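With the revalidate path gone, a caller that hits an all-DOWN cluster now gets ServiceClusterUnavailable immediately and can retry once the keepalive has had a chance to run. A hedged usage sketch; the retry policy, the GET path, and the shape of the yielded connection are illustrative, not actual plugin code:

    import time

    def get_logical_ports(cluster, retries=3):
        # hypothetical caller; `cluster` is a ClusteredAPI instance and
        # nsx_exc.ServiceClusterUnavailable is the exception raised in
        # the hunk above (nsx_exc assumed imported as in that module)
        for attempt in range(retries):
            try:
                with cluster.endpoint_connection() as conn:
                    return conn.get('api/v1/logical-ports')
            except nsx_exc.ServiceClusterUnavailable:
                if attempt == retries - 1:
                    raise
                # all endpoints DOWN; the keepalive ping (driven by
                # conn_idle_timeout, now 10s by default) restores them
                # in the background, so waiting briefly may succeed
                time.sleep(10)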
@@ -71,8 +71,6 @@ class NsxV3PluginTestCaseMixin(test_plugin.NeutronDbPluginV2TestCase,
         self.cluster = nsx_cluster.NSXClusteredAPI(
             http_provider=nsxlib_testcase.MemoryMockAPIProvider(self.mock_api))
 
-        self.cluster.revalidate_endpoints()
-
         def _patch_object(*args, **kwargs):
             patcher = mock.patch.object(*args, **kwargs)
             patcher.start()