Prevent downtime when client cert is regenerated

When the client certificate is regenerated, keepalive connections
to the NSX endpoint are broken. This patch detects that condition and
invokes a callback to give the nsxlib user a chance to reload the cert;
it then regenerates the connection pool to restore connectivity.

Change-Id: I0a334df4dd05feb784b9ff8bdc988ac41878863c
This commit is contained in:
Anna Khmelnitsky 2017-01-31 19:25:45 -08:00 committed by garyk
parent 2ac012456d
commit 2b36887f5c
5 changed files with 57 additions and 5 deletions

View File

@ -94,6 +94,9 @@ class NsxLib(object):
def reinitialize_cluster(self, resource, event, trigger, **kwargs):
    """Re-initialize the API cluster.

    Delegates to ``self.cluster._reinit_cluster()``. The ``resource``,
    ``event``, ``trigger`` and keyword arguments are accepted but not
    used by this method.
    """
    cluster = self.cluster
    cluster._reinit_cluster()
def subscribe(self, callback, event):
    """Register ``callback`` for ``event`` on the underlying cluster.

    Thin pass-through to ``self.cluster.subscribe``; nothing is returned.
    """
    cluster = self.cluster
    cluster.subscribe(callback, event)
class NsxLibPortMirror(utils.NsxLibApiBase):

View File

@ -24,12 +24,27 @@ from vmware_nsxlib.v3 import utils
LOG = log.getLogger(__name__)
# HTTP status codes that map directly to a specific exception class.
ERRORS = {requests.codes.NOT_FOUND: exceptions.ResourceNotFound,
requests.codes.PRECONDITION_FAILED: exceptions.StaleRevision}
# Exception class used when a status code has no entry in ERRORS.
DEFAULT_ERROR = exceptions.ManagerError
# NOTE(review): usage is not visible in this view; presumably the prefix of
# an "empty" paging cursor -- confirm against the callers.
NULL_CURSOR_PREFIX = '0000'
def http_error_to_exception(status_code, error_code):
    """Pick the exception class for an HTTP status and NSX error code.

    :param status_code: HTTP status code of the failed response.
    :param error_code: error code reported with the response, or None.
                       It may be a string or an integer; it is compared
                       by its string form.
    :returns: the mapped exception class, or ``exceptions.ManagerError``
              when no specific mapping exists.
    """
    errors = {requests.codes.NOT_FOUND: exceptions.ResourceNotFound,
              requests.codes.PRECONDITION_FAILED: exceptions.StaleRevision,
              requests.codes.INTERNAL_SERVER_ERROR:
              {'99': exceptions.ClientCertificateNotTrusted}}

    mapped = errors.get(status_code)
    if isinstance(mapped, dict):
        # Several errors share this status: choose based on error code.
        # The sub-table is keyed by strings, so normalize the code first;
        # an integer 99 must select the same entry as '99'.
        mapped = mapped.get(str(error_code))

    # default exception
    return exceptions.ManagerError if mapped is None else mapped
class RESTClient(object):
_VERB_RESP_CODES = {
@ -96,7 +111,7 @@ class RESTClient(object):
def _raise_error(self, status_code, operation, result_msg,
                 error_code=None):
    """Raise the exception class mapped to a failed request.

    :param status_code: HTTP status code of the response.
    :param operation: description of the attempted operation, passed to
                      the exception.
    :param result_msg: details passed to the exception.
    :param error_code: optional error code, used to select the exception
                       class and also passed to the exception.
    :raises: the class returned by ``http_error_to_exception``.
    """
    # A single lookup decides the class; an earlier ERRORS.get() result
    # was overwritten before use, so that dead store is not kept.
    error = http_error_to_exception(status_code, error_code)
    raise error(manager='', operation=operation, details=result_msg,
                error_code=error_code)
@ -223,7 +238,7 @@ class NSX3Client(JSONRESTClient):
def _raise_error(self, status_code, operation, result_msg,
error_code=None):
"""Override the Rest client errors to add the manager IPs"""
error = ERRORS.get(status_code, DEFAULT_ERROR)
error = http_error_to_exception(status_code, error_code)
raise error(manager=self.nsx_api_managers,
operation=operation,
details=result_msg,

View File

@ -33,6 +33,7 @@ from requests import exceptions as requests_exceptions
from vmware_nsxlib._i18n import _, _LI, _LW
from vmware_nsxlib.v3 import client as nsx_client
from vmware_nsxlib.v3 import exceptions
from vmware_nsxlib.v3 import nsx_constants
LOG = log.getLogger(__name__)
@ -206,6 +207,12 @@ class Endpoint(object):
self._state = EndpointState.INITIALIZED
self._last_updated = datetime.datetime.now()
def regenerate_pool(self):
    """Replace ``self.pool`` with a new pool of the same configuration.

    The new pool takes ``min_size``, ``max_size`` and the ``create``
    callable from the current pool and is ordered as a stack.
    """
    current = self.pool
    self.pool = pools.Pool(min_size=current.min_size,
                           max_size=current.max_size,
                           order_as_stack=True,
                           create=current.create)
@property
def last_updated(self):
    """Read-only access to the value stored in ``_last_updated``."""
    stamp = self._last_updated
    return stamp
@ -260,6 +267,7 @@ class ClusteredAPI(object):
self._http_provider = http_provider
self._keepalive_interval = keepalive_interval
self._callbacks = {}
def _init_cluster(*args, **kwargs):
self._init_endpoints(providers,
@ -358,11 +366,30 @@ class ClusteredAPI(object):
if up == len(self._endpoints)
else ClusterHealth.ORANGE)
def subscribe(self, callback, event):
    """Register ``callback`` to be invoked when ``event`` is notified.

    Callbacks are kept per event in registration order; registering the
    same callback twice stores it twice.

    :param callback: callable taking no arguments.
    :param event: hashable event identifier.
    """
    # setdefault creates the per-event list on first use with one lookup.
    self._callbacks.setdefault(event, []).append(callback)
def _notify(self, event):
if event in self._callbacks:
for callback in self._callbacks[event]:
callback()
def _validate(self, endpoint):
try:
with endpoint.pool.item() as conn:
self._http_provider.validate_connection(self, endpoint, conn)
endpoint.set_state(EndpointState.UP)
except exceptions.ClientCertificateNotTrusted:
LOG.warning(_LW("Failed to validate API cluster endpoint "
"'%(ep)s' due to untrusted client certificate"),
{'ep': endpoint})
# allow nsxlib user to reload certificate that possibly changed
self._notify(nsx_constants.ON_CLIENT_CERT_UNTRUSTED)
# regenerate connection pool based on new certificate
endpoint.regenerate_pool()
except Exception as e:
endpoint.set_state(EndpointState.DOWN)
LOG.warning(_LW("Failed to validate API cluster endpoint "

View File

@ -92,6 +92,10 @@ class StaleRevision(ManagerError):
pass
class ClientCertificateNotTrusted(ManagerError):
    """Manager error signalling that the client certificate is not trusted."""

    message = _("Certificate not trusted")
class ServiceClusterUnavailable(ManagerError):
    """Manager error for a service cluster that is unavailable.

    The message template expects a ``cluster_id`` value.
    """

    message = _("Service cluster: '%(cluster_id)s' is unavailable. Please, "
                "check NSX setup and/or configuration")

View File

@ -110,3 +110,6 @@ ERR_CODE_IPAM_IP_NOT_IN_POOL = 5110
ERR_CODE_IPAM_RANGE_MODIFY = 5602
ERR_CODE_IPAM_RANGE_DELETE = 5015
ERR_CODE_IPAM_RANGE_SHRUNK = 5016
# NsxLib events
# Event name under which subscribed callbacks are notified when endpoint
# validation fails with ClientCertificateNotTrusted.
ON_CLIENT_CERT_UNTRUSTED = 'on_client_cert_untrusted'