Add idle state to driver providers
This change adds an idle state to driver providers which is used to indicate that the provider should stop performing actions that are not safe to perform while we bootstrap a second newer version of the provider to handle a config update. This is particularly interesting for the static driver because it is managing all of its state internally to nodepool and not relying on external cloud systems to track resources. This means it is important for the static provider to not have an old provider object update zookeeper at the same time as a new provider object. This was previously possible and created situations where the resources in zookeeper did not reflect our local config. Since all other drivers rely on external state, the primary update here is to the static driver. We simply stop performing config synchronization if the idle flag is set on a static provider. This will allow the new provider to take over reflecting the new config consistently. Note, we don't take other approaches and essentially create a system specific to the static driver because we're trying to avoid modifying the nodepool runtime significantly to fix a problem that is specific to the static driver. Change-Id: I93519d0c6f4ddf8a417d837f6ae12a30a55870bb
This commit is contained in:
parent
6cfda7de66
commit
2a231a08c9
@ -195,6 +195,17 @@ class Provider(ProviderNotifications):
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def idle(self):
|
||||
"""Idle the provider
|
||||
|
||||
This is called before stop(). Providers should use this as a signal
|
||||
to idle themselves and stop performing any actions that may interfere
|
||||
with a new version of this provider starting up.
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def join(self):
|
||||
"""Wait for provider to finish
|
||||
|
@ -56,6 +56,9 @@ class KubernetesProvider(Provider, QuotaSupport):
|
||||
self.log.debug("Stopping")
|
||||
self.ready = False
|
||||
|
||||
def idle(self):
|
||||
pass
|
||||
|
||||
def listNodes(self):
|
||||
servers = []
|
||||
|
||||
|
@ -52,6 +52,9 @@ class OpenshiftProvider(Provider, QuotaSupport):
|
||||
def stop(self):
|
||||
self.log.debug("Stopping")
|
||||
|
||||
def idle(self):
|
||||
pass
|
||||
|
||||
def listNodes(self):
|
||||
servers = []
|
||||
|
||||
|
@ -69,6 +69,9 @@ class OpenStackProvider(Provider, QuotaSupport):
|
||||
self.running = False
|
||||
self._server_list_watcher_stop_event.set()
|
||||
|
||||
def idle(self):
|
||||
pass
|
||||
|
||||
def join(self):
|
||||
self._server_list_watcher.join()
|
||||
|
||||
|
@ -510,6 +510,9 @@ class StateMachineProvider(Provider, QuotaSupport):
|
||||
self.adapter.stop()
|
||||
self.log.debug("Stopped")
|
||||
|
||||
def idle(self):
|
||||
pass
|
||||
|
||||
def join(self):
|
||||
self.log.debug("Joining")
|
||||
if self.state_machine_thread:
|
||||
|
@ -59,6 +59,9 @@ class StaticNodeProvider(Provider, QuotaSupport):
|
||||
# multiple threads (e.g. cleanup and deleted node worker).
|
||||
self._register_lock = threading.Lock()
|
||||
self._node_slots = {} # nodeTuple -> [node]
|
||||
# Flag to indicate that we need to stop processing state that could
|
||||
# interfere with a newer version of ourselves running.
|
||||
self._idle = False
|
||||
|
||||
def _getSlot(self, node):
|
||||
return self._node_slots[nodeTuple(node)].index(node)
|
||||
@ -412,6 +415,9 @@ class StaticNodeProvider(Provider, QuotaSupport):
|
||||
def stop(self):
|
||||
self.log.debug("Stopping")
|
||||
|
||||
def idle(self):
|
||||
self._idle = True
|
||||
|
||||
def poolNodes(self):
|
||||
return {
|
||||
nodeTuple(n): n
|
||||
@ -437,6 +443,8 @@ class StaticNodeProvider(Provider, QuotaSupport):
|
||||
return True
|
||||
|
||||
def cleanupLeakedResources(self):
|
||||
if self._idle:
|
||||
return
|
||||
with self._register_lock:
|
||||
self.getRegisteredNodes()
|
||||
for pool in self.provider.pools.values():
|
||||
@ -458,6 +466,9 @@ class StaticNodeProvider(Provider, QuotaSupport):
|
||||
'''
|
||||
Re-register the deleted node.
|
||||
'''
|
||||
if self._idle:
|
||||
return
|
||||
|
||||
# It's possible a deleted node no longer exists in our config, so
|
||||
# don't bother to reregister.
|
||||
node_tuple = nodeTuple(node)
|
||||
|
@ -29,6 +29,9 @@ class TestProvider(Provider):
|
||||
def stop(self):
|
||||
pass
|
||||
|
||||
def idle(self):
|
||||
pass
|
||||
|
||||
def join(self):
|
||||
pass
|
||||
|
||||
|
@ -50,6 +50,9 @@ class ProviderManager(object):
|
||||
if old_config:
|
||||
oldmanager = old_config.provider_managers.get(p.name)
|
||||
if oldmanager and p != oldmanager.provider:
|
||||
# Signal that actions not safe to run on both the old and
|
||||
# new providers while we synchronize should cease to run.
|
||||
oldmanager.idle()
|
||||
stop_managers.append(oldmanager)
|
||||
oldmanager = None
|
||||
if oldmanager:
|
||||
|
Loading…
x
Reference in New Issue
Block a user