From e7acdfe91ae1e539fa89de4e161d06dde5ede427 Mon Sep 17 00:00:00 2001
From: Boden R
Date: Wed, 3 Feb 2016 14:39:27 -0700
Subject: [PATCH] NSX-v3 update endpoint state only on timeout

This patch removes the NSX v3 client cluster logic that forces a
revalidation of all endpoints when endpoint selection finds only DOWN
endpoints. The revalidate call can cause cascading backpressure under
certain circumstances. Now DOWN endpoints are only returned to UP as
part of the endpoint keepalive ping, which is controlled via the
conn_idle_timeout config property. The default conn_idle_timeout is
therefore also decreased to 10s, ensuring endpoint revalidation occurs
(by default) on a frequent basis.

backport: liberty

Change-Id: I5423bce793892dd864353a23ca7c288b846a1ab6
Closes-Bug: #1541591
---
 etc/nsx.ini                                 |  2 +-
 vmware_nsx/common/config.py                 |  2 +-
 vmware_nsx/nsxlib/v3/cluster.py             | 17 ++++-------------
 vmware_nsx/tests/unit/nsx_v3/test_plugin.py |  2 --
 4 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/etc/nsx.ini b/etc/nsx.ini
index 5159d271bb..053720cadf 100644
--- a/etc/nsx.ini
+++ b/etc/nsx.ini
@@ -369,7 +369,7 @@
 
 # The amount of time in seconds to wait before ensuring connectivity to
 # the NSX manager if no manager connection has been used.
-# conn_idle_timeout = 60
+# conn_idle_timeout = 10
 
 # UUID of the default tier0 router that will be used for connecting to
 # tier1 logical routers and configuring external networks
diff --git a/vmware_nsx/common/config.py b/vmware_nsx/common/config.py
index 6dd7989a21..9fd1772f28 100644
--- a/vmware_nsx/common/config.py
+++ b/vmware_nsx/common/config.py
@@ -232,7 +232,7 @@ nsx_v3_opts = [
                help=_("Maximum concurrent connections to each NSX "
                       "manager.")),
     cfg.IntOpt('conn_idle_timeout',
-               default=60,
+               default=10,
               help=_('Ensure connectivity to the NSX manager if a connection '
                      'is not used within timeout seconds.')),
     cfg.IntOpt('redirects',
diff --git a/vmware_nsx/nsxlib/v3/cluster.py b/vmware_nsx/nsxlib/v3/cluster.py
index ec52c3c6e2..988861b655 100644
--- a/vmware_nsx/nsxlib/v3/cluster.py
+++ b/vmware_nsx/nsxlib/v3/cluster.py
@@ -326,11 +326,6 @@ class ClusteredAPI(object):
                 if up == len(self._endpoints)
                 else ClusterHealth.ORANGE)
 
-    def revalidate_endpoints(self):
-        # validate each endpoint in serial
-        for endpoint in self._endpoints.values():
-            self._validate(endpoint)
-
     def _validate(self, endpoint):
         try:
             with endpoint.pool.item() as conn:
@@ -343,7 +338,7 @@ class ClusteredAPI(object):
                             "'%(ep)s' due to: %(err)s"),
                         {'ep': endpoint, 'err': e})
 
-    def _select_endpoint(self, revalidate=False):
+    def _select_endpoint(self):
         connected = {}
         for provider_id, endpoint in self._endpoints.items():
             if endpoint.state == EndpointState.UP:
@@ -352,12 +347,6 @@ class ClusteredAPI(object):
                     # connection can be used now
                     return endpoint
 
-        if not connected and revalidate:
-            LOG.debug("All endpoints DOWN; revalidating.")
-            # endpoints may have become available, try to revalidate
-            self.revalidate_endpoints()
-            return self._select_endpoint(revalidate=False)
-
         # no free connections; randomly select a connected endpoint
         # which will likely wait on pool.item() until a connection frees up
         return (connected[random.choice(connected.keys())]
@@ -382,8 +371,10 @@ class ClusteredAPI(object):
 
     @contextlib.contextmanager
     def endpoint_connection(self):
-        endpoint = self._select_endpoint(revalidate=True)
+        endpoint = self._select_endpoint()
         if not endpoint:
+            # all endpoints are DOWN and will have their next
+            # state updated as per _endpoint_keepalive()
             raise nsx_exc.ServiceClusterUnavailable(
                 cluster_id=self.cluster_id)
 
diff --git a/vmware_nsx/tests/unit/nsx_v3/test_plugin.py b/vmware_nsx/tests/unit/nsx_v3/test_plugin.py
index 376b5de2f3..d42434d27e 100644
--- a/vmware_nsx/tests/unit/nsx_v3/test_plugin.py
+++ b/vmware_nsx/tests/unit/nsx_v3/test_plugin.py
@@ -71,8 +71,6 @@ class NsxV3PluginTestCaseMixin(test_plugin.NeutronDbPluginV2TestCase,
         self.cluster = nsx_cluster.NSXClusteredAPI(
             http_provider=nsxlib_testcase.MemoryMockAPIProvider(self.mock_api))
 
-        self.cluster.revalidate_endpoints()
-
         def _patch_object(*args, **kwargs):
            patcher = mock.patch.object(*args, **kwargs)
            patcher.start()
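
For context, a minimal sketch of the keepalive pattern this patch now
relies on exclusively, written with hypothetical names
(endpoint_keepalive, validate); it illustrates the timer-driven
revalidation referenced by _endpoint_keepalive() in the diff above, not
the actual vmware_nsx implementation:

    # Illustrative sketch only -- hypothetical names, not vmware_nsx code.
    # A timer loop pings every endpoint each conn_idle_timeout seconds;
    # after this patch, this loop is the only path that transitions DOWN
    # endpoints back to UP.
    import time


    class EndpointState(object):
        UP = 'UP'
        DOWN = 'DOWN'


    def endpoint_keepalive(endpoints, validate, conn_idle_timeout=10):
        while True:
            time.sleep(conn_idle_timeout)
            for endpoint in endpoints:
                # validate() pings the NSX manager and sets
                # endpoint.state to UP on success or DOWN on failure
                validate(endpoint)

With the default conn_idle_timeout lowered from 60s to 10s, a cluster
whose endpoints are all DOWN surfaces ServiceClusterUnavailable for at
most roughly one keepalive interval before the endpoints are retried.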
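Likewise, the post-patch selection logic reads as the following sketch
(hypothetical standalone code mirroring _select_endpoint; it reuses
EndpointState from the sketch above): with the revalidate branch gone,
an all-DOWN cluster yields None immediately rather than forcing serial
revalidation on the request path.

    import random


    def select_endpoint(endpoints):
        # endpoints: dict mapping provider_id -> endpoint
        connected = {}
        for provider_id, endpoint in endpoints.items():
            if endpoint.state == EndpointState.UP:
                connected[provider_id] = endpoint
                if endpoint.pool.free():
                    # a free connection is available right now
                    return endpoint
        if not connected:
            # all endpoints DOWN; no revalidation here -- the keepalive
            # loop is responsible for restoring endpoint state
            return None
        # no free connections; pick a connected endpoint at random and
        # let the caller block on its pool until a connection frees up
        return connected[random.choice(list(connected.keys()))]

Note the list() around connected.keys(): random.choice() cannot index a
dict view on Python 3, so the sketch diverges from the diff's
connected.keys() expression on that one point.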