From 207d8ac63ce9d0ffa78ce203a589d65789f352a8 Mon Sep 17 00:00:00 2001
From: "James E. Blair"
Date: Tue, 28 Jun 2022 16:38:26 -0700
Subject: [PATCH] AWS multi quota support

This adds support for AWS quotas that are specific to instance types.

The current quota support in AWS assumes only the "standard" instance
types, but AWS has several additional types with particular
specialties (high memory, GPU, etc).  This adds automatic support for
those by encoding their service quota codes (like 'L-1216C47A') into
the QuotaInformation object.

QuotaInformation accepts not only cores, ram, and instances as
resource values, but now also accepts arbitrary keys such as
'L-1216C47A'.

Extra testing of QuotaInformation is added to ensure we handle the
arithmetic correctly in cases where one or the other operand does not
have a resource counter.

The statemachine drivers did not encode their resource information
into the ZK Node record, so tenant quota was not operating correctly.
This is now fixed.

The AWS driver now accepts max-cores, max-servers, and max-ram values
similar to the OpenStack driver.  It additionally accepts
max-resources, which can be used to specify limits for arbitrary
quotas like 'L-1216C47A'.

The tenant quota system now also accepts arbitrary keys such as
'L-1216C47A' so that, for example, high memory nodes may be limited
by tenant.

The mapping of instance types to quotas is manually maintained;
however, AWS doesn't seem to add new instance types very often, and
those it does add are highly specialized.  If a new instance type is
not handled internally, the driver will not be able to calculate
expected quota usage, but will still operate until the new type is
added to the mapping.

Change-Id: Iefdc8f3fb8249c61c43fe51b592f551e273f9c36
---
 doc/source/aws.rst                            |  73 +++++++++
 doc/source/configuration.rst                  |   7 +-
 nodepool/cmd/config_validator.py              |   1 +
 nodepool/config.py                            |  19 +--
 nodepool/driver/aws/adapter.py                |  86 ++++++++--
 nodepool/driver/aws/config.py                 |  23 +++
 nodepool/driver/statemachine.py               |  19 ++-
 nodepool/driver/utils.py                      |  13 +-
 nodepool/tests/fixtures/aws/aws-limits.yaml   |  46 ++++++
 nodepool/tests/fixtures/aws/aws-quota.yaml    |  43 +++++
 nodepool/tests/unit/test_driver_aws.py        | 153 +++++++++++++++++-
 nodepool/tests/unit/test_utils.py             |  68 ++++++++
 .../aws-multi-quota-fbddefb56d0694a4.yaml     |   7 +
 13 files changed, 524 insertions(+), 34 deletions(-)
 create mode 100644 nodepool/tests/fixtures/aws/aws-limits.yaml
 create mode 100644 nodepool/tests/fixtures/aws/aws-quota.yaml
 create mode 100644 nodepool/tests/unit/test_utils.py
 create mode 100644 releasenotes/notes/aws-multi-quota-fbddefb56d0694a4.yaml

diff --git a/doc/source/aws.rst b/doc/source/aws.rst
index d2f611c81..991d21322 100644
--- a/doc/source/aws.rst
+++ b/doc/source/aws.rst
@@ -109,6 +109,42 @@ Selecting the ``aws`` driver adds the following options to the
       until that instance is reported as "active".  If the timeout is
       exceeded, the node launch is aborted and the instance deleted.
 
+   .. attr:: max-cores
+      :type: int
+      :default: unlimited
+
+      Maximum number of cores usable from this provider's pools by
+      default.
+
+   .. attr:: max-servers
+      :type: int
+      :default: unlimited
+
+      Maximum number of servers spawnable from this provider's pools
+      by default.
+
+   .. attr:: max-ram
+      :type: int
+      :default: unlimited
+
+      Maximum RAM usable from this provider's pools by default.
+
+   .. attr:: max-resources
+      :type: dict
+      :default: unlimited
+
+      A dictionary of other quota resource limits.  AWS has quotas
+      for certain instance types.  These may be specified here to
+      limit Nodepool's usage.
+
+      The following example limits the number of high-memory
+      instance cores:
+
+      .. code-block:: yaml
+
+         max-resources:
+           'L-43DA4232': 224
+
+      See `instance quotas`_ for more information.
+
    .. attr:: launch-retries
       :default: 3
@@ -379,6 +415,42 @@ Selecting the ``aws`` driver adds the following options to the
          A dictionary of key-value pairs that will be stored with the node data
          in ZooKeeper.  The keys and values can be any arbitrary string.
 
+      .. attr:: max-cores
+         :type: int
+
+         Maximum number of cores usable from this pool.  Defaults to
+         :attr:`providers.[aws].max-cores`.
+
+      .. attr:: max-servers
+         :type: int
+
+         Maximum number of servers spawnable from this pool.
+         Defaults to :attr:`providers.[aws].max-servers`.
+
+      .. attr:: max-ram
+         :type: int
+
+         Maximum RAM usable from this pool.  Defaults to
+         :attr:`providers.[aws].max-ram`.
+
+      .. attr:: max-resources
+         :type: dict
+
+         A dictionary of other quota resource limits.  AWS has quotas
+         for certain instance types.  These may be specified here to
+         limit Nodepool's usage.  Defaults to
+         :attr:`providers.[aws].max-resources`.
+
+         The following example limits the number of high-memory
+         instance cores:
+
+         .. code-block:: yaml
+
+            max-resources:
+              'L-43DA4232': 224
+
+         See `instance quotas`_ for more information.
+
    .. attr:: subnet-id
 
       If provided, specifies the subnet to assign to the primary network
@@ -538,3 +610,4 @@ Selecting the ``aws`` driver adds the following options to the
 .. _`Boto configuration`: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
 .. _`Boto describe images`: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images
 .. _`VM Import/Export service role`: https://docs.aws.amazon.com/vm-import/latest/userguide/vmie_prereqs.html#vmimport-role
+.. _`instance quotas`: https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas
diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst
index 435eebdff..d3cfc309c 100644
--- a/doc/source/configuration.rst
+++ b/doc/source/configuration.rst
@@ -542,8 +542,13 @@ Options
            max-servers: 10
            max-cores: 200
            max-ram: 16565
+           'L-43DA4232': 224
 
-   Each entry is a dictionary with the following keys.
+   Each entry is a dictionary with the following keys.  Any other
+   keys are interpreted as driver-specific resource limits (the same
+   limits otherwise specified as ``max-resources`` in the provider
+   configuration).  The only driver that currently supports
+   additional resource limits is AWS.
 
    .. attr:: tenant-name
      :type: str
diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py
index c06aa7043..6212d064b 100644
--- a/nodepool/cmd/config_validator.py
+++ b/nodepool/cmd/config_validator.py
@@ -67,6 +67,7 @@ class ConfigValidator:
             'max-cores': int,
             'max-ram': int,
             'max-servers': int,
+            str: int,
         }
 
         top_level = {
diff --git a/nodepool/config.py b/nodepool/config.py
index 3d6ccafe5..0e08ea74f 100644
--- a/nodepool/config.py
+++ b/nodepool/config.py
@@ -250,19 +250,14 @@ class Config(ConfigValue):
         if not tenant_resource_limits_cfg:
             return
         for resource_limit in tenant_resource_limits_cfg:
-            tenant_name = resource_limit['tenant-name']
-            max_cores = resource_limit.get('max-cores')
-            max_ram = resource_limit.get('max-ram')
-            max_servers = resource_limit.get('max-servers')
-
+            resource_limit = resource_limit.copy()
+            tenant_name = resource_limit.pop('tenant-name')
             limits = {}
-            if max_cores:
-                limits['cores'] = max_cores
-            if max_servers:
-                limits['instances'] = max_servers
-            if max_ram:
-                limits['ram'] = max_ram
-
+            limits['cores'] = resource_limit.pop('max-cores', math.inf)
+            limits['instances'] = resource_limit.pop('max-servers', math.inf)
+            limits['ram'] = resource_limit.pop('max-ram', math.inf)
+            for k, v in resource_limit.items():
+                limits[k] = v
             self.tenant_resource_limits[tenant_name] = limits
 
diff --git a/nodepool/driver/aws/adapter.py b/nodepool/driver/aws/adapter.py
index 1110658f1..8e1ec84cf 100644
--- a/nodepool/driver/aws/adapter.py
+++ b/nodepool/driver/aws/adapter.py
@@ -42,6 +42,41 @@ def tag_list_to_dict(taglist):
     return {t["Key"]: t["Value"] for t in taglist}
 
 
+# This is a map of instance types to quota codes.  There does not
+# appear to be an automated way to determine what quota code to use
+# for an instance type, therefore this list was manually created by
+# visiting
+# https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas
+# and filtering by "Instances".  An example description is "Running
+# On-Demand P instances" which we can infer means we should use that
+# quota code for instance types starting with the letter "p".  All
+# instance type names follow the format "([a-z\-]+)\d", so we can
+# match the first letters (up to the first number) of the instance
+# type name with the letters in the quota name.  The prefix "u-" for
+# "Running On-Demand High Memory instances" was determined from
+# https://aws.amazon.com/ec2/instance-types/high-memory/
+
+QUOTA_CODES = {
+    'a': 'L-1216C47A',
+    'c': 'L-1216C47A',
+    'd': 'L-1216C47A',
+    'h': 'L-1216C47A',
+    'i': 'L-1216C47A',
+    'm': 'L-1216C47A',
+    'r': 'L-1216C47A',
+    't': 'L-1216C47A',
+    'z': 'L-1216C47A',
+    'dl': 'L-6E869C2A',
+    'f': 'L-74FC7D96',
+    'g': 'L-DB2E81BA',
+    'vt': 'L-DB2E81BA',
+    'u-': 'L-43DA4232',  # 'high memory'
+    'inf': 'L-1945791B',
+    'p': 'L-417A185B',
+    'x': 'L-7295265B',
+}
+
+
 class AwsInstance(statemachine.Instance):
     def __init__(self, instance, quota):
         super().__init__()
@@ -293,15 +328,28 @@ class AwsAdapter(statemachine.Adapter):
             yield AwsInstance(instance, quota)
 
     def getQuotaLimits(self):
-        with self.non_mutating_rate_limiter:
-            self.log.debug("Getting quota limits")
-            response = self.aws_quotas.get_service_quota(
-                ServiceCode='ec2',
-                QuotaCode='L-1216C47A'
-            )
-            cores = response['Quota']['Value']
-            return QuotaInformation(cores=cores,
-                                    default=math.inf)
+        # Get the instance types that this provider handles
+        instance_types = set()
+        for pool in self.provider.pools.values():
+            for label in pool.labels.values():
+                instance_types.add(label.instance_type)
+        args = dict(default=math.inf)
+        for instance_type in instance_types:
+            code = self._getQuotaCodeForInstanceType(instance_type)
+            if code in args:
+                continue
+            if not code:
+                self.log.warning("Unknown quota code for instance type: %s",
+                                 instance_type)
+                continue
+            with self.non_mutating_rate_limiter:
+                self.log.debug("Getting quota limits for %s", code)
+                response = self.aws_quotas.get_service_quota(
+                    ServiceCode='ec2',
+                    QuotaCode=code,
+                )
+                args[code] = response['Quota']['Value']
+        return QuotaInformation(**args)
 
     def getQuotaForLabel(self, label):
         return self._getQuotaForInstanceType(label.instance_type)
@@ -454,13 +502,27 @@ class AwsAdapter(statemachine.Adapter):
         # Return the first and only task
         return task
 
+    instance_key_re = re.compile(r'([a-z\-]+)\d.*')
+
+    def _getQuotaCodeForInstanceType(self, instance_type):
+        m = self.instance_key_re.match(instance_type)
+        if m:
+            key = m.group(1)
+            return QUOTA_CODES.get(key)
+
     def _getQuotaForInstanceType(self, instance_type):
         itype = self._getInstanceType(instance_type)
         cores = itype['InstanceTypes'][0]['VCpuInfo']['DefaultCores']
         ram = itype['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']
-        return QuotaInformation(cores=cores,
-                                ram=ram,
-                                instances=1)
+        code = self._getQuotaCodeForInstanceType(instance_type)
+        # We include cores twice: once to match the overall cores
+        # quota (which may be set as a tenant resource limit), and a
+        # second time as the specific AWS quota code for this
+        # instance type.
+        args = dict(cores=cores, ram=ram, instances=1)
+        if code:
+            args[code] = cores
+        return QuotaInformation(**args)
 
     @cachetools.func.lru_cache(maxsize=None)
     def _getInstanceType(self, instance_type):
diff --git a/nodepool/driver/aws/config.py b/nodepool/driver/aws/config.py
index 71ad5a246..d94c5c1e8 100644
--- a/nodepool/driver/aws/config.py
+++ b/nodepool/driver/aws/config.py
@@ -15,6 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import defaultdict
+import math
+
 import voluptuous as v
 
 from nodepool.driver import ConfigPool
@@ -203,6 +206,13 @@ class AwsPool(ConfigPool):
             'use-internal-ip', self.provider.use_internal_ip)
         self.host_key_checking = pool_config.get(
             'host-key-checking', self.provider.host_key_checking)
+        self.max_servers = pool_config.get(
+            'max-servers', self.provider.max_servers)
+        self.max_cores = pool_config.get('max-cores', self.provider.max_cores)
+        self.max_ram = pool_config.get('max-ram', self.provider.max_ram)
+        self.max_resources = self.provider.max_resources.copy()
+        for k, val in pool_config.get('max-resources', {}).items():
+            self.max_resources[k] = val
 
     @staticmethod
     def getSchema():
@@ -218,6 +228,9 @@ class AwsPool(ConfigPool):
             'public-ipv4': bool,
             'public-ipv6': bool,
             'host-key-checking': bool,
+            'max-cores': int,
+            'max-ram': int,
+            'max-resources': {str: int},
         })
         return pool
 
@@ -263,6 +276,12 @@ class AwsProviderConfig(ProviderConfig):
         self.image_type = self.provider.get('image-format', 'raw')
         self.image_name_format = '{image_name}-{timestamp}'
         self.post_upload_hook = self.provider.get('post-upload-hook')
+        self.max_servers = self.provider.get('max-servers', math.inf)
+        self.max_cores = self.provider.get('max-cores', math.inf)
+        self.max_ram = self.provider.get('max-ram', math.inf)
+        self.max_resources = defaultdict(lambda: math.inf)
+        for k, val in self.provider.get('max-resources', {}).items():
+            self.max_resources[k] = val
 
         self.cloud_images = {}
         for image in self.provider.get('cloud-images', []):
@@ -305,6 +324,10 @@ class AwsProviderConfig(ProviderConfig):
             'launch-retries': int,
             'object-storage': object_storage,
             'image-format': v.Any('ova', 'vhd', 'vhdx', 'vmdk', 'raw'),
+            'max-servers': int,
+            'max-cores': int,
+            'max-ram': int,
+            'max-resources': {str: int},
         })
         return v.Schema(provider)
diff --git a/nodepool/driver/statemachine.py b/nodepool/driver/statemachine.py
index fd4a636de..482380681 100644
--- a/nodepool/driver/statemachine.py
+++ b/nodepool/driver/statemachine.py
@@ -123,6 +123,10 @@ class StateMachineNodeLauncher(stats.StatsReporter):
         self.node.shell_type = image.shell_type
         self.node.connection_port = image.connection_port
         self.node.connection_type = image.connection_type
+        qi = self.manager.quotaNeededByLabel(label.name, self.handler.pool)
+        if qi:
+            self.node.resources = qi.get_resources()
+
         self.zk.storeNode(self.node)
 
         # Windows computer names can be no more than 15 chars long.
@@ -386,11 +390,14 @@
 
         # Now calculate pool specific quota. Values indicating no quota default
         # to math.inf representing infinity that can be calculated with.
-        pool_quota = QuotaInformation(
+        args = dict(
             cores=getattr(self.pool, 'max_cores', None),
             instances=self.pool.max_servers,
             ram=getattr(self.pool, 'max_ram', None),
-            default=math.inf)
+            default=math.inf,
+        )
+        args.update(getattr(self.pool, 'max_resources', {}))
+        pool_quota = QuotaInformation(**args)
 
         pool_quota.subtract(needed_quota)
         return pool_quota.non_negative()
@@ -403,6 +410,7 @@
 
         :return: True if there is enough quota, False otherwise
         '''
         needed_quota = self.manager.quotaNeededByLabel(ntype, self.pool)
+        self.log.debug("Needed quota: %s", needed_quota)
 
         # Calculate remaining quota which is calculated as:
         # quota = <total> - <used> - <needed>
@@ -418,11 +426,14 @@
 
         # Now calculate pool specific quota. Values indicating no quota default
         # to math.inf representing infinity that can be calculated with.
-        pool_quota = QuotaInformation(
+        args = dict(
             cores=getattr(self.pool, 'max_cores', None),
             instances=self.pool.max_servers,
             ram=getattr(self.pool, 'max_ram', None),
-            default=math.inf)
+            default=math.inf,
+        )
+        args.update(getattr(self.pool, 'max_resources', {}))
+        pool_quota = QuotaInformation(**args)
 
         pool_quota.subtract(
             self.manager.estimatedNodepoolQuotaUsed(self.pool))
         self.log.debug("Current pool quota: %s" % pool_quota)
diff --git a/nodepool/driver/utils.py b/nodepool/driver/utils.py
index 0df42a494..80b607709 100644
--- a/nodepool/driver/utils.py
+++ b/nodepool/driver/utils.py
@@ -1,4 +1,5 @@
 # Copyright (C) 2018 Red Hat
+# Copyright 2022 Acme Gating, LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -179,7 +180,7 @@ class NodeDeleter(threading.Thread):
 
 class QuotaInformation:
 
-    def __init__(self, cores=None, instances=None, ram=None, default=0):
+    def __init__(self, cores=None, instances=None, ram=None, default=0, **kw):
         '''
         Initializes the quota information with some values. None values will
         be initialized with default which will be typically 0 or math.inf
@@ -202,6 +203,9 @@
                 'ram': self._get_default(ram, default),
             }
         }
+        for k, v in kw.items():
+            self.quota['compute'][k] = v
+        self.default = default
 
     @staticmethod
     def construct_from_flavor(flavor):
@@ -225,9 +229,14 @@
         return value if value is not None else default
 
     def _add_subtract(self, other, add=True):
+        for category in other.quota.keys():
+            self.quota.setdefault(category, {})
+            for resource in other.quota[category].keys():
+                self.quota[category].setdefault(resource, self.default)
         for category in self.quota.keys():
             for resource in self.quota[category].keys():
-                second_value = other.quota.get(category, {}).get(resource, 0)
+                second_value = other.quota.get(category, {}).get(
+                    resource, other.default)
                 if add:
                     self.quota[category][resource] += second_value
                 else:
diff --git a/nodepool/tests/fixtures/aws/aws-limits.yaml b/nodepool/tests/fixtures/aws/aws-limits.yaml
new file mode 100644
index 000000000..089d068cc
--- /dev/null
+++ b/nodepool/tests/fixtures/aws/aws-limits.yaml
@@ -0,0 +1,46 @@
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+zookeeper-tls:
+  ca: {zookeeper_ca}
+  cert: {zookeeper_cert}
+  key: {zookeeper_key}
+
+tenant-resource-limits:
+  - tenant-name: tenant-1
+    max-cores: 1024
+    'L-43DA4232': 224  # high mem cores
+
+labels:
+  - name: standard
+  - name: high
+
+providers:
+  - name: ec2-us-west-2
+    driver: aws
+    region-name: us-west-2
+    cloud-images:
+      - name: ubuntu1404
+        image-id: ami-1e749f67
+        username: ubuntu
+    pools:
+      - name: main
+        max-servers: 10
+        subnet-id: {subnet_id}
+        security-group-id: {security_group_id}
+        node-attributes:
+          key1: value1
+          key2: value2
+        max-resources:
+          'L-1216C47A': 1  # standard cores
+        labels:
+          - name: standard
+            cloud-image: ubuntu1404
+            instance-type: t3.medium
+            key-name: zuul
+          - name: high
+            cloud-image: ubuntu1404
+            instance-type: u-6tb1.112xlarge
+            key-name: zuul
diff --git a/nodepool/tests/fixtures/aws/aws-quota.yaml b/nodepool/tests/fixtures/aws/aws-quota.yaml
new file mode 100644
index 000000000..9dce1c959
--- /dev/null
+++ b/nodepool/tests/fixtures/aws/aws-quota.yaml
@@ -0,0 +1,43 @@
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+zookeeper-tls:
+  ca: {zookeeper_ca}
+  cert: {zookeeper_cert}
+  key: {zookeeper_key}
+
+tenant-resource-limits:
+  - tenant-name: tenant-1
+    max-cores: 1024
+
+labels:
+  - name: standard
+  - name: high
+
+providers:
+  - name: ec2-us-west-2
+    driver: aws
+    region-name: us-west-2
+    cloud-images:
+      - name: ubuntu1404
+        image-id: ami-1e749f67
+        username: ubuntu
+    pools:
+      - name: main
+        max-servers: 10
+        subnet-id: {subnet_id}
+        security-group-id: {security_group_id}
+        node-attributes:
+          key1: value1
+          key2: value2
+        labels:
+          - name: standard
+            cloud-image: ubuntu1404
+            instance-type: t3.medium
+            key-name: zuul
+          - name: high
+            cloud-image: ubuntu1404
+            instance-type: u-6tb1.112xlarge
+            key-name: zuul
diff --git a/nodepool/tests/unit/test_driver_aws.py b/nodepool/tests/unit/test_driver_aws.py
index 3290d78ac..3e0b5beb1 100644
--- a/nodepool/tests/unit/test_driver_aws.py
+++ b/nodepool/tests/unit/test_driver_aws.py
@@ -114,7 +114,8 @@ class TestDriverAws(tests.DBTestCase):
             kw['security_group_id'] = self.security_group_id
         return super().setup_config(*args, **kw)
 
-    def patchProvider(self, nodepool, provider_name='ec2-us-west-2'):
+    def patchProvider(self, nodepool, provider_name='ec2-us-west-2',
+                      quotas=None):
         for _ in iterate_timeout(
                 30, Exception, 'wait for provider'):
             try:
@@ -138,10 +139,13 @@
             _fake_create_instances
 
         # moto does not mock service-quotas, so we do it ourselves:
-        def _fake_get_service_quota(*args, **kwargs):
+        def _fake_get_service_quota(ServiceCode, QuotaCode, *args, **kwargs):
             # This is a simple fake that only returns the number
             # of cores.
-            return {'Quota': {'Value': 100}}
+            if quotas is None:
+                return {'Quota': {'Value': 100}}
+            else:
+                return {'Quota': {'Value': quotas.get(QuotaCode)}}
         provider_manager.adapter.aws_quotas.get_service_quota =\
             _fake_get_service_quota
@@ -204,6 +208,149 @@ class TestDriverAws(tests.DBTestCase):
         for node in nodes:
             self.waitForNodeDeletion(node)
 
+    def test_aws_multi_quota(self):
+        # Test multiple instance type quotas (standard and high-mem)
+        configfile = self.setup_config('aws/aws-quota.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+        self.patchProvider(pool, quotas={
+            'L-1216C47A': 1,
+            'L-43DA4232': 224,
+        })
+
+        # Create a high-memory node request.
+        req1 = zk.NodeRequest()
+        req1.state = zk.REQUESTED
+        req1.node_types.append('high')
+        self.zk.storeNodeRequest(req1)
+        self.log.debug("Waiting for request %s", req1.id)
+        req1 = self.waitForNodeRequest(req1)
+        node1 = self.assertSuccess(req1)
+
+        # Create a second high-memory node request; this should be
+        # over quota so it won't be fulfilled.
+        req2 = zk.NodeRequest()
+        req2.state = zk.REQUESTED
+        req2.node_types.append('high')
+        self.zk.storeNodeRequest(req2)
+        self.log.debug("Waiting for request %s", req2.id)
+        req2 = self.waitForNodeRequest(req2, (zk.PENDING,))
+
+        # Make sure we're paused while we attempt to fulfill the
+        # second request.
+        pool_worker = pool.getPoolWorkers('ec2-us-west-2')
+        for _ in iterate_timeout(30, Exception, 'paused handler'):
+            if pool_worker[0].paused_handler:
+                break
+
+        # Release the first node so that the second can be fulfilled.
+        node1.state = zk.USED
+        self.zk.storeNode(node1)
+        self.waitForNodeDeletion(node1)
+
+        # Make sure the second high node exists now.
+        req2 = self.waitForNodeRequest(req2)
+        self.assertSuccess(req2)
+
+        # Create a standard node request which should succeed even
+        # though we're at quota for high-mem (but not standard).
+        req3 = zk.NodeRequest()
+        req3.state = zk.REQUESTED
+        req3.node_types.append('standard')
+        self.zk.storeNodeRequest(req3)
+        self.log.debug("Waiting for request %s", req3.id)
+        req3 = self.waitForNodeRequest(req3)
+        self.assertSuccess(req3)
+
+    def test_aws_multi_pool_limits(self):
+        # Test multiple instance type quotas (standard and high-mem)
+        # with pool resource limits
+        configfile = self.setup_config('aws/aws-limits.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+        self.patchProvider(pool, quotas={
+            'L-1216C47A': 1000,
+            'L-43DA4232': 1000,
+        })
+
+        # Create a standard node request.
+        req1 = zk.NodeRequest()
+        req1.state = zk.REQUESTED
+        req1.node_types.append('standard')
+        self.zk.storeNodeRequest(req1)
+        self.log.debug("Waiting for request %s", req1.id)
+        req1 = self.waitForNodeRequest(req1)
+        node1 = self.assertSuccess(req1)
+
+        # Create a second standard node request; this should be over
+        # the pool's standard-core resource limit so it won't be
+        # fulfilled.
+        req2 = zk.NodeRequest()
+        req2.state = zk.REQUESTED
+        req2.node_types.append('standard')
+        self.zk.storeNodeRequest(req2)
+        self.log.debug("Waiting for request %s", req2.id)
+        req2 = self.waitForNodeRequest(req2, (zk.PENDING,))
+
+        # Make sure we're paused while we attempt to fulfill the
+        # second request.
+        pool_worker = pool.getPoolWorkers('ec2-us-west-2')
+        for _ in iterate_timeout(30, Exception, 'paused handler'):
+            if pool_worker[0].paused_handler:
+                break
+
+        # Release the first node so that the second can be fulfilled.
+        node1.state = zk.USED
+        self.zk.storeNode(node1)
+        self.waitForNodeDeletion(node1)
+
+        # Make sure the second standard node exists now.
+        req2 = self.waitForNodeRequest(req2)
+        self.assertSuccess(req2)
+
+    def test_aws_multi_tenant_limits(self):
+        # Test multiple instance type quotas (standard and high-mem)
+        # with tenant resource limits
+        configfile = self.setup_config('aws/aws-limits.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+        self.patchProvider(pool, quotas={
+            'L-1216C47A': 1000,
+            'L-43DA4232': 1000,
+        })
+
+        # Create a high node request.
+        req1 = zk.NodeRequest()
+        req1.state = zk.REQUESTED
+        req1.tenant_name = 'tenant-1'
+        req1.node_types.append('high')
+        self.zk.storeNodeRequest(req1)
+        self.log.debug("Waiting for request %s", req1.id)
+        req1 = self.waitForNodeRequest(req1)
+        self.assertSuccess(req1)
+
+        # Create a second high node request; this should be over the
+        # tenant's high-memory core limit so it won't be fulfilled.
+        req2 = zk.NodeRequest()
+        req2.state = zk.REQUESTED
+        req2.tenant_name = 'tenant-1'
+        req2.node_types.append('high')
+        self.zk.storeNodeRequest(req2)
+        req2 = self.waitForNodeRequest(req2, (zk.REQUESTED,))
+
+        # Create a standard node request which should succeed even
+        # though we're at quota for high-mem (but not standard).
+        req3 = zk.NodeRequest()
+        req3.state = zk.REQUESTED
+        req3.tenant_name = 'tenant-1'
+        req3.node_types.append('standard')
+        self.zk.storeNodeRequest(req3)
+        self.log.debug("Waiting for request %s", req3.id)
+        req3 = self.waitForNodeRequest(req3)
+        self.assertSuccess(req3)
+
+        # Assert that the second request is still being deferred
+        req2 = self.waitForNodeRequest(req2, (zk.REQUESTED,))
+
     def test_aws_node(self):
         req = self.requestNode('aws/aws.yaml', 'ubuntu1404')
         node = self.assertSuccess(req)
diff --git a/nodepool/tests/unit/test_utils.py b/nodepool/tests/unit/test_utils.py
new file mode 100644
index 000000000..8d1f4ce63
--- /dev/null
+++ b/nodepool/tests/unit/test_utils.py
@@ -0,0 +1,68 @@
+# Copyright 2022 Acme Gating, LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.  You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import copy
+import math
+
+from nodepool import tests
+from nodepool.driver.utils import QuotaInformation
+
+
+class TestQuotaInformation(tests.BaseTestCase):
+    def test_subtract(self):
+        provider = QuotaInformation(cores=8, ram=8192, default=math.inf)
+        needed = QuotaInformation(cores=2, instances=1)
+        expected = QuotaInformation(cores=6, instances=math.inf, ram=8192)
+
+        remain = copy.deepcopy(provider)
+        remain.subtract(needed)
+
+        self.assertEqual(expected.quota, remain.quota)
+
+    def test_add(self):
+        label1 = QuotaInformation(cores=8, ram=8192)
+        label2 = QuotaInformation(cores=2, instances=1)
+
+        needed = copy.deepcopy(label1)
+        needed.add(label2)
+        expected = QuotaInformation(cores=10, instances=1, ram=8192)
+        self.assertEqual(expected.quota, needed.quota)
+
+    def test_extra(self):
+        # Test extra quota fields.  We call them red_, blue_, and
+        # green_ cores here; they are arbitrary names other than the
+        # standard cores, ram, and instances.
+        label1 = QuotaInformation(cores=8, ram=8192,
+                                  red_cores=8, green_cores=8)
+        label2 = QuotaInformation(cores=2, instances=1, blue_cores=2)
+
+        needed = copy.deepcopy(label1)
+        needed.add(label2)
+        expected = QuotaInformation(cores=10, instances=1, ram=8192,
+                                    red_cores=8, blue_cores=2,
+                                    green_cores=8)
+        self.assertEqual(expected.quota, needed.quota)
+
+        provider = QuotaInformation(cores=8, ram=8192, default=math.inf,
+                                    green_cores=16)
+        expected = QuotaInformation(cores=-2, instances=math.inf, ram=0,
+                                    red_cores=math.inf, blue_cores=math.inf,
+                                    green_cores=8)
+
+        remain = copy.deepcopy(provider)
+        remain.subtract(needed)
+
+        self.assertEqual(expected.quota, remain.quota)
diff --git a/releasenotes/notes/aws-multi-quota-fbddefb56d0694a4.yaml b/releasenotes/notes/aws-multi-quota-fbddefb56d0694a4.yaml
new file mode 100644
index 000000000..acebd17cf
--- /dev/null
+++ b/releasenotes/notes/aws-multi-quota-fbddefb56d0694a4.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    The AWS driver now supports multiple quotas for specific instance
+    types.  This support is automatic, but also includes corresponding
+    enhancements to provider, pool, and tenant limits configured in
+    Nodepool.
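
The QuotaInformation arithmetic this change introduces can be illustrated
with a minimal, self-contained sketch.  MiniQuotaInformation below is a
hypothetical stand-in adapted from the _add_subtract change in
nodepool/driver/utils.py; it is an illustration under those assumptions,
not part of the patch:

    # A minimal sketch (not part of this patch) of the quota arithmetic
    # added in nodepool/driver/utils.py.
    import math

    class MiniQuotaInformation:
        def __init__(self, default=0, **resources):
            # `default` is 0 for usage objects and math.inf for limit
            # objects; it fills in any resource the other operand
            # carries but this one does not.
            self.default = default
            self.quota = dict(resources)

        def subtract(self, other):
            # Grow this object to the union of resource keys first, so
            # arbitrary keys like 'L-43DA4232' are handled the same way
            # as cores, ram, and instances.
            for resource in other.quota:
                self.quota.setdefault(resource, self.default)
            for resource in self.quota:
                self.quota[resource] -= other.quota.get(resource,
                                                        other.default)

    # Provider limits: 1024 standard cores, 224 high-memory cores.
    limits = MiniQuotaInformation(default=math.inf, cores=1024,
                                  **{'L-43DA4232': 224})
    # Two u-6tb1.112xlarge instances need 448 high-memory cores.
    needed = MiniQuotaInformation(cores=448, **{'L-43DA4232': 448})
    limits.subtract(needed)
    print(limits.quota)  # {'cores': 576, 'L-43DA4232': -224} -> over quota

Growing each operand to the union of resource keys before the arithmetic
is what lets arbitrary quota codes participate alongside the standard
cores, ram, and instances counters.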
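Similarly, the instance-type-to-quota-code lookup can be exercised on its
own.  The regex and the QUOTA_CODES subset are copied from the adapter
change above; the free function quota_code is a hypothetical rendering of
_getQuotaCodeForInstanceType for illustration only:

    # Sketch of the instance-type -> quota-code lookup: the regex
    # captures the letters before the first digit of the instance type
    # name and looks them up in the QUOTA_CODES mapping.
    import re

    QUOTA_CODES = {
        't': 'L-1216C47A',   # standard instances
        'u-': 'L-43DA4232',  # high memory
        'p': 'L-417A185B',   # P (GPU) instances
    }  # abbreviated here; the patch carries the full mapping

    instance_key_re = re.compile(r'([a-z\-]+)\d.*')

    def quota_code(instance_type):
        # Unknown instance types yield None; the adapter logs a
        # warning and keeps operating in that case.
        m = instance_key_re.match(instance_type)
        return QUOTA_CODES.get(m.group(1)) if m else None

    assert quota_code('t3.medium') == 'L-1216C47A'
    assert quota_code('u-6tb1.112xlarge') == 'L-43DA4232'
    assert quota_code('quantum9.large') is None  # not in the mapping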