Improve max-servers handling for GCE
The quota handling for the simple driver (used by GCE) only handles max-servers, but even so, it still didn't take into consideration currently building servers. If a number of simultaneous requests were received, it would try to build them all and eventually return node failures for the ones that the cloud refused to build. The OpenStack driver has a lot of nice quota handling methods which do take currently building nodes into account. This change moves some of those methods into a new Provider mixin class for quota support. This class implements some handy methods which perform the calculations and provides some abstract methods which providers will need to implement in order to supply information. The simple driver is updated to use this system, though it still only supports max-servers for the moment. Change-Id: I0ce742452914301552f4af5e92a3e36304a7e291
This commit is contained in:
parent
004f60eb8b
commit
9e9a5b9bfd
@ -147,7 +147,7 @@ class ProviderNotifications(object):
|
||||
pass
|
||||
|
||||
|
||||
class Provider(ProviderNotifications, metaclass=abc.ABCMeta):
|
||||
class Provider(ProviderNotifications):
|
||||
"""The Provider interface
|
||||
|
||||
Drivers implement this interface to supply Providers. Each
|
||||
@ -160,6 +160,9 @@ class Provider(ProviderNotifications, metaclass=abc.ABCMeta):
|
||||
The class or instance attribute **name** must be provided as a string.
|
||||
|
||||
"""
|
||||
def __init__(self, *args, **kw):
|
||||
super().__init__(*args, **kw)
|
||||
|
||||
@abc.abstractmethod
|
||||
def start(self, zk_conn):
|
||||
"""Start this provider
|
||||
|
@ -152,7 +152,7 @@ class OpenStackNodeLauncher(NodeLauncher):
|
||||
self.node.image_id = image_id
|
||||
|
||||
pool = self.handler.provider.pools.get(self.node.pool)
|
||||
resources = self.handler.manager.quotaNeededByNodeType(
|
||||
resources = self.handler.manager.quotaNeededByLabel(
|
||||
self.node.type[0], pool)
|
||||
self.node.resources = resources.quota['compute']
|
||||
if username:
|
||||
@ -340,7 +340,7 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
|
||||
return True
|
||||
|
||||
def hasRemainingQuota(self, ntype):
|
||||
needed_quota = self.manager.quotaNeededByNodeType(ntype, self.pool)
|
||||
needed_quota = self.manager.quotaNeededByLabel(ntype, self.pool)
|
||||
|
||||
if not self.pool.ignore_provider_quota:
|
||||
# Calculate remaining quota which is calculated as:
|
||||
@ -374,7 +374,7 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
|
||||
|
||||
for ntype in node_types:
|
||||
needed_quota.add(
|
||||
self.manager.quotaNeededByNodeType(ntype, self.pool))
|
||||
self.manager.quotaNeededByLabel(ntype, self.pool))
|
||||
|
||||
if not self.pool.ignore_provider_quota:
|
||||
cloud_quota = self.manager.estimatedNodepoolQuota()
|
||||
|
@ -14,7 +14,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
import logging
|
||||
import operator
|
||||
import os
|
||||
@ -24,7 +23,7 @@ import openstack
|
||||
|
||||
from nodepool import exceptions
|
||||
from nodepool.driver import Provider
|
||||
from nodepool.driver.utils import QuotaInformation
|
||||
from nodepool.driver.utils import QuotaInformation, QuotaSupport
|
||||
from nodepool.nodeutils import iterate_timeout
|
||||
from nodepool import stats
|
||||
from nodepool import version
|
||||
@ -35,19 +34,18 @@ from nodepool.driver.openstack import handler
|
||||
|
||||
|
||||
IPS_LIST_AGE = 5 # How long to keep a cached copy of the ip list
|
||||
MAX_QUOTA_AGE = 5 * 60 # How long to keep the quota information cached
|
||||
|
||||
|
||||
class OpenStackProvider(Provider):
|
||||
class OpenStackProvider(Provider, QuotaSupport):
|
||||
log = logging.getLogger("nodepool.driver.openstack.OpenStackProvider")
|
||||
|
||||
def __init__(self, provider):
|
||||
super().__init__()
|
||||
self.provider = provider
|
||||
self._images = {}
|
||||
self._networks = {}
|
||||
self.__flavors = {} # TODO(gtema): caching
|
||||
self.__azs = None
|
||||
self._current_nodepool_quota = None
|
||||
self._zk = None
|
||||
self._down_ports = set()
|
||||
self._last_port_cleanup = None
|
||||
@ -91,7 +89,11 @@ class OpenStackProvider(Provider):
|
||||
app_version=version.version_info.version_string()
|
||||
)
|
||||
|
||||
def quotaNeededByNodeType(self, ntype, pool):
|
||||
def getProviderLimits(self):
|
||||
limits = self._client.get_compute_limits()
|
||||
return QuotaInformation.construct_from_limits(limits)
|
||||
|
||||
def quotaNeededByLabel(self, ntype, pool):
|
||||
provider_label = pool.labels[ntype]
|
||||
|
||||
flavor = self.findFlavor(provider_label.flavor_name,
|
||||
@ -99,86 +101,6 @@ class OpenStackProvider(Provider):
|
||||
|
||||
return QuotaInformation.construct_from_flavor(flavor)
|
||||
|
||||
def estimatedNodepoolQuota(self):
|
||||
'''
|
||||
Determine how much quota is available for nodepool managed resources.
|
||||
This needs to take into account the quota of the tenant, resources
|
||||
used outside of nodepool and the currently used resources by nodepool,
|
||||
max settings in nodepool config. This is cached for MAX_QUOTA_AGE
|
||||
seconds.
|
||||
|
||||
:return: Total amount of resources available which is currently
|
||||
available to nodepool including currently existing nodes.
|
||||
'''
|
||||
|
||||
if self._current_nodepool_quota:
|
||||
now = time.time()
|
||||
if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE:
|
||||
return copy.deepcopy(self._current_nodepool_quota['quota'])
|
||||
|
||||
limits = self._client.get_compute_limits()
|
||||
|
||||
# This is initialized with the full tenant quota and later becomes
|
||||
# the quota available for nodepool.
|
||||
nodepool_quota = QuotaInformation.construct_from_limits(limits)
|
||||
self.log.debug("Provider quota for %s: %s",
|
||||
self.provider.name, nodepool_quota)
|
||||
|
||||
# Subtract the unmanaged quota usage from nodepool_max
|
||||
# to get the quota available for us.
|
||||
nodepool_quota.subtract(self.unmanagedQuotaUsed())
|
||||
|
||||
self._current_nodepool_quota = {
|
||||
'quota': nodepool_quota,
|
||||
'timestamp': time.time()
|
||||
}
|
||||
|
||||
self.log.debug("Available quota for %s: %s",
|
||||
self.provider.name, nodepool_quota)
|
||||
|
||||
return copy.deepcopy(nodepool_quota)
|
||||
|
||||
def invalidateQuotaCache(self):
|
||||
self._current_nodepool_quota['timestamp'] = 0
|
||||
|
||||
def estimatedNodepoolQuotaUsed(self, pool=None):
|
||||
'''
|
||||
Sums up the quota used (or planned) currently by nodepool. If pool is
|
||||
given it is filtered by the pool.
|
||||
|
||||
:param pool: If given, filtered by the pool.
|
||||
:return: Calculated quota in use by nodepool
|
||||
'''
|
||||
used_quota = QuotaInformation()
|
||||
|
||||
for node in self._zk.nodeIterator():
|
||||
if node.provider == self.provider.name:
|
||||
try:
|
||||
if pool and not node.pool == pool.name:
|
||||
continue
|
||||
provider_pool = self.provider.pools.get(node.pool)
|
||||
if not provider_pool:
|
||||
self.log.warning(
|
||||
"Cannot find provider pool for node %s" % node)
|
||||
# This node is in a funny state we log it for debugging
|
||||
# but move on and don't account it as we can't properly
|
||||
# calculate its cost without pool info.
|
||||
continue
|
||||
if node.type[0] not in provider_pool.labels:
|
||||
self.log.warning("Node type is not in provider pool "
|
||||
"for node %s" % node)
|
||||
# This node is also in a funny state; the config
|
||||
# may have changed under it. It should settle out
|
||||
# eventually when it's deleted.
|
||||
continue
|
||||
node_resources = self.quotaNeededByNodeType(
|
||||
node.type[0], provider_pool)
|
||||
used_quota.add(node_resources)
|
||||
except Exception:
|
||||
self.log.exception("Couldn't consider invalid node %s "
|
||||
"for quota:" % node)
|
||||
return used_quota
|
||||
|
||||
def unmanagedQuotaUsed(self):
|
||||
'''
|
||||
Sums up the quota used by servers unmanaged by nodepool.
|
||||
|
@ -18,7 +18,7 @@ import math
|
||||
|
||||
from nodepool.driver.taskmanager import BaseTaskManagerProvider, Task
|
||||
from nodepool.driver import Driver, NodeRequestHandler
|
||||
from nodepool.driver.utils import NodeLauncher, QuotaInformation
|
||||
from nodepool.driver.utils import NodeLauncher, QuotaInformation, QuotaSupport
|
||||
from nodepool.nodeutils import iterate_timeout, nodescan
|
||||
from nodepool import exceptions
|
||||
from nodepool import zk
|
||||
@ -199,18 +199,31 @@ class SimpleTaskManagerHandler(NodeRequestHandler):
|
||||
:param ntype: node type for the quota check
|
||||
:return: True if there is enough quota, False otherwise
|
||||
'''
|
||||
# TODO: Add support for real quota handling; this only handles
|
||||
# max_servers.
|
||||
needed_quota = QuotaInformation(cores=1, instances=1, ram=1, default=1)
|
||||
n_running = self.manager.countNodes(self.provider.name, self.pool.name)
|
||||
pool_quota = QuotaInformation(
|
||||
cores=math.inf,
|
||||
instances=self.pool.max_servers - n_running,
|
||||
ram=math.inf,
|
||||
default=math.inf)
|
||||
needed_quota = self.manager.quotaNeededByLabel(ntype, self.pool)
|
||||
|
||||
# Calculate remaining quota which is calculated as:
|
||||
# quota = <total nodepool quota> - <used quota> - <quota for node>
|
||||
cloud_quota = self.manager.estimatedNodepoolQuota()
|
||||
cloud_quota.subtract(
|
||||
self.manager.estimatedNodepoolQuotaUsed())
|
||||
cloud_quota.subtract(needed_quota)
|
||||
self.log.debug("Predicted remaining provider quota: %s",
|
||||
cloud_quota)
|
||||
|
||||
if not cloud_quota.non_negative():
|
||||
return False
|
||||
|
||||
# Now calculate pool specific quota. Values indicating no quota default
|
||||
# to math.inf representing infinity that can be calculated with.
|
||||
# TODO: add cores, ram
|
||||
pool_quota = QuotaInformation(instances=self.pool.max_servers,
|
||||
default=math.inf)
|
||||
pool_quota.subtract(
|
||||
self.manager.estimatedNodepoolQuotaUsed(self.pool))
|
||||
self.log.debug("Current pool quota: %s" % pool_quota)
|
||||
pool_quota.subtract(needed_quota)
|
||||
self.log.debug("hasRemainingQuota({},{}) = {}".format(
|
||||
self.pool, ntype, pool_quota))
|
||||
self.log.debug("Predicted remaining pool quota: %s", pool_quota)
|
||||
|
||||
return pool_quota.non_negative()
|
||||
|
||||
def launchesComplete(self):
|
||||
@ -243,7 +256,7 @@ class SimpleTaskManagerHandler(NodeRequestHandler):
|
||||
self._threads.append(thd)
|
||||
|
||||
|
||||
class SimpleTaskManagerProvider(BaseTaskManagerProvider):
|
||||
class SimpleTaskManagerProvider(BaseTaskManagerProvider, QuotaSupport):
|
||||
"""The Provider implementation for the SimpleTaskManager driver
|
||||
framework"""
|
||||
def __init__(self, adapter, provider):
|
||||
@ -263,6 +276,22 @@ class SimpleTaskManagerProvider(BaseTaskManagerProvider):
|
||||
def labelReady(self, label):
|
||||
return True
|
||||
|
||||
def getProviderLimits(self):
|
||||
# TODO: query the api to get real limits
|
||||
return QuotaInformation(
|
||||
cores=math.inf,
|
||||
instances=math.inf,
|
||||
ram=math.inf,
|
||||
default=math.inf)
|
||||
|
||||
def quotaNeededByLabel(self, ntype, pool):
|
||||
# TODO: return real quota information about a label
|
||||
return QuotaInformation(cores=1, instances=1, ram=1, default=1)
|
||||
|
||||
def unmanagedQuotaUsed(self):
|
||||
# TODO: return real quota information about quota
|
||||
return QuotaInformation()
|
||||
|
||||
def cleanupNode(self, external_id):
|
||||
instance = self.getInstance(external_id)
|
||||
if (not instance) or instance.deleted:
|
||||
|
@ -164,6 +164,7 @@ class BaseTaskManagerProvider(Provider):
|
||||
log = logging.getLogger("nodepool.driver.taskmanager.TaskManagerProvider")
|
||||
|
||||
def __init__(self, provider):
|
||||
super().__init__()
|
||||
self.provider = provider
|
||||
self.thread = None
|
||||
self.task_manager = TaskManager(provider.name, provider.rate_limit)
|
||||
|
@ -15,6 +15,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
import abc
|
||||
import copy
|
||||
import logging
|
||||
import math
|
||||
import threading
|
||||
@ -28,6 +29,9 @@ from nodepool import zk
|
||||
from nodepool.logconfig import get_annotated_logger
|
||||
|
||||
|
||||
MAX_QUOTA_AGE = 5 * 60 # How long to keep the quota information cached
|
||||
|
||||
|
||||
class NodeLauncher(threading.Thread,
|
||||
stats.StatsReporter,
|
||||
metaclass=abc.ABCMeta):
|
||||
@ -171,3 +175,120 @@ class QuotaInformation:
|
||||
|
||||
def __str__(self):
|
||||
return str(self.quota)
|
||||
|
||||
|
||||
class QuotaSupport:
|
||||
"""A mix-in class for providers to supply quota support methods"""
|
||||
|
||||
def __init__(self, *args, **kw):
|
||||
super().__init__(*args, **kw)
|
||||
self._current_nodepool_quota = None
|
||||
|
||||
@abc.abstractmethod
|
||||
def quotaNeededByLabel(self, label, pool):
|
||||
"""Return quota information about a label
|
||||
|
||||
:param str label: The label name
|
||||
:param ProviderPool pool: A ProviderPool config object with the label
|
||||
|
||||
:return: QuotaInformation about the label
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def unmanagedQuotaUsed(self):
|
||||
'''
|
||||
Sums up the quota used by servers unmanaged by nodepool.
|
||||
|
||||
:return: Calculated quota in use by unmanaged servers
|
||||
'''
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def getProviderLimits(self):
|
||||
'''
|
||||
Get the resource limits from the provider.
|
||||
|
||||
:return: QuotaInformation about the label
|
||||
'''
|
||||
print("base")
|
||||
pass
|
||||
|
||||
def invalidateQuotaCache(self):
|
||||
self._current_nodepool_quota['timestamp'] = 0
|
||||
|
||||
def estimatedNodepoolQuota(self):
|
||||
'''
|
||||
Determine how much quota is available for nodepool managed resources.
|
||||
This needs to take into account the quota of the tenant, resources
|
||||
used outside of nodepool and the currently used resources by nodepool,
|
||||
max settings in nodepool config. This is cached for MAX_QUOTA_AGE
|
||||
seconds.
|
||||
|
||||
:return: Total amount of resources available which is currently
|
||||
available to nodepool including currently existing nodes.
|
||||
'''
|
||||
|
||||
if self._current_nodepool_quota:
|
||||
now = time.time()
|
||||
if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE:
|
||||
return copy.deepcopy(self._current_nodepool_quota['quota'])
|
||||
|
||||
# This is initialized with the full tenant quota and later becomes
|
||||
# the quota available for nodepool.
|
||||
nodepool_quota = self.getProviderLimits()
|
||||
|
||||
self.log.debug("Provider quota for %s: %s",
|
||||
self.provider.name, nodepool_quota)
|
||||
|
||||
# Subtract the unmanaged quota usage from nodepool_max
|
||||
# to get the quota available for us.
|
||||
nodepool_quota.subtract(self.unmanagedQuotaUsed())
|
||||
|
||||
self._current_nodepool_quota = {
|
||||
'quota': nodepool_quota,
|
||||
'timestamp': time.time()
|
||||
}
|
||||
|
||||
self.log.debug("Available quota for %s: %s",
|
||||
self.provider.name, nodepool_quota)
|
||||
|
||||
return copy.deepcopy(nodepool_quota)
|
||||
|
||||
def estimatedNodepoolQuotaUsed(self, pool=None):
|
||||
'''
|
||||
Sums up the quota used (or planned) currently by nodepool. If pool is
|
||||
given it is filtered by the pool.
|
||||
|
||||
:param pool: If given, filtered by the pool.
|
||||
:return: Calculated quota in use by nodepool
|
||||
'''
|
||||
used_quota = QuotaInformation()
|
||||
|
||||
for node in self._zk.nodeIterator():
|
||||
if node.provider == self.provider.name:
|
||||
try:
|
||||
if pool and not node.pool == pool.name:
|
||||
continue
|
||||
provider_pool = self.provider.pools.get(node.pool)
|
||||
if not provider_pool:
|
||||
self.log.warning(
|
||||
"Cannot find provider pool for node %s" % node)
|
||||
# This node is in a funny state we log it for debugging
|
||||
# but move on and don't account it as we can't properly
|
||||
# calculate its cost without pool info.
|
||||
continue
|
||||
if node.type[0] not in provider_pool.labels:
|
||||
self.log.warning("Node type is not in provider pool "
|
||||
"for node %s" % node)
|
||||
# This node is also in a funny state; the config
|
||||
# may have changed under it. It should settle out
|
||||
# eventually when it's deleted.
|
||||
continue
|
||||
node_resources = self.quotaNeededByLabel(
|
||||
node.type[0], provider_pool)
|
||||
used_quota.add(node_resources)
|
||||
except Exception:
|
||||
self.log.exception("Couldn't consider invalid node %s "
|
||||
"for quota:" % node)
|
||||
return used_quota
|
||||
|
Loading…
x
Reference in New Issue
Block a user