From acb6772c3ab9e6b5394a335d8b108110f8a46c1d Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 17 Jul 2023 14:52:11 -0700 Subject: [PATCH] Add AWS volume quota support Like the OpenStack driver, this automatically applies volume quota limits if specified in the label configuration. Change-Id: I71c1b95de08dc72cc777099952892de659d45d41 --- nodepool/driver/aws/adapter.py | 85 +++++++++++++++++-- .../tests/fixtures/aws/aws-volume-quota.yaml | 42 +++++++++ nodepool/tests/unit/test_driver_aws.py | 59 +++++++++++++ .../aws-volume-quota-90f42264bb9cee01.yaml | 6 ++ 4 files changed, 185 insertions(+), 7 deletions(-) create mode 100644 nodepool/tests/fixtures/aws/aws-volume-quota.yaml create mode 100644 releasenotes/notes/aws-volume-quota-90f42264bb9cee01.yaml diff --git a/nodepool/driver/aws/adapter.py b/nodepool/driver/aws/adapter.py index 1b9c8ce1d..42cd24b1a 100644 --- a/nodepool/driver/aws/adapter.py +++ b/nodepool/driver/aws/adapter.py @@ -83,6 +83,16 @@ QUOTA_CODES = { 'hpc': ['L-F7808C92', ''] } +VOLUME_QUOTA_CODES = { + 'io1': dict(iops='L-B3A130E6', storage='L-FD252861'), + 'io2': dict(iops='L-8D977E7E', storage='L-09BD8365'), + 'sc1': dict(storage='L-17AF77E8'), + 'gp2': dict(storage='L-D18FCD1D'), + 'gp3': dict(storage='L-7A658B76'), + 'standard': dict(storage='L-9CF3C2EB'), + 'st1': dict(storage='L-82ACEF56'), +} + CACHE_TTL = 10 ON_DEMAND = 0 SPOT = 1 @@ -194,9 +204,7 @@ class AwsCreateStateMachine(statemachine.StateMachine): if instance is None: return self.instance = instance - self.quota = self.adapter._getQuotaForInstanceType( - self.instance.instance_type, - SPOT if self.label.use_spot else ON_DEMAND) + self.quota = self.adapter.getQuotaForLabel(self.label) self.state = self.INSTANCE_CREATING if self.state == self.INSTANCE_CREATING: @@ -360,23 +368,31 @@ class AwsAdapter(statemachine.Adapter): self._deleteObject(resource.id) def listInstances(self): + volumes = {} + for volume in self._listVolumes(): + volumes[volume.volume_id] = volume for instance in self._listInstances(): if instance.state["Name"].lower() == "terminated": continue quota = self._getQuotaForInstanceType( instance.instance_type, SPOT if instance.instance_lifecycle == 'spot' else ON_DEMAND) + for attachment in instance.block_device_mappings: + volume = volumes.get(attachment['Ebs']['VolumeId']) + quota.add(self._getQuotaForVolume(volume)) yield AwsInstance(self.provider, instance, quota) def getQuotaLimits(self): - # Get the instance types that this provider handles + # Get the instance and volume types that this provider handles instance_types = {} + volume_types = set() for pool in self.provider.pools.values(): for label in pool.labels.values(): if label.instance_type not in instance_types: instance_types[label.instance_type] = set() instance_types[label.instance_type].add( SPOT if label.use_spot else ON_DEMAND) + volume_types.add(label.volume_type) args = dict(default=math.inf) for instance_type in instance_types: for market_type_option in instance_types[instance_type]: @@ -390,18 +406,46 @@ class AwsAdapter(statemachine.Adapter): instance_type) continue with self.non_mutating_rate_limiter: - self.log.debug("Getting quota limits for %s", code) + self.log.debug("Getting EC2 quota limits for %s", code) response = self.aws_quotas.get_service_quota( ServiceCode='ec2', QuotaCode=code, ) args[code] = response['Quota']['Value'] + for volume_type in volume_types: + vquota_codes = VOLUME_QUOTA_CODES.get(volume_type) + if not vquota_codes: + self.log.warning( + "Unknown quota code for volume type: %s", + 
volume_type) + continue + for resource, code in vquota_codes.items(): + if code in args: + continue + with self.non_mutating_rate_limiter: + self.log.debug("Getting EBS quota limits for %s", code) + response = self.aws_quotas.get_service_quota( + ServiceCode='ebs', + QuotaCode=code, + ) + value = response['Quota']['Value'] + # Unit mismatch: storage limit is in TB, but usage + # is in GB. Translate the limit to GB. + if resource == 'storage': + value *= 1000 + args[code] = value return QuotaInformation(**args) def getQuotaForLabel(self, label): - return self._getQuotaForInstanceType( + quota = self._getQuotaForInstanceType( label.instance_type, SPOT if label.use_spot else ON_DEMAND) + if label.volume_type: + quota.add(self._getQuotaForVolumeType( + label.volume_type, + storage=label.volume_size, + iops=label.iops)) + return quota def uploadImage(self, provider_image, image_name, filename, image_format, metadata, md5, sha256): @@ -788,6 +832,26 @@ class AwsAdapter(statemachine.Adapter): args = dict(cores=cores, ram=ram, instances=1) if code: args[code] = vcpus + + return QuotaInformation(**args) + + def _getQuotaForVolume(self, volume): + volume_type = volume.volume_type + vquota_codes = VOLUME_QUOTA_CODES.get(volume_type, {}) + args = {} + if 'iops' in vquota_codes and getattr(volume, 'iops', None): + args[vquota_codes['iops']] = volume.iops + if 'storage' in vquota_codes and getattr(volume, 'size', None): + args[vquota_codes['storage']] = volume.size + return QuotaInformation(**args) + + def _getQuotaForVolumeType(self, volume_type, storage=None, iops=None): + vquota_codes = VOLUME_QUOTA_CODES.get(volume_type, {}) + args = {} + if 'iops' in vquota_codes and iops is not None: + args[vquota_codes['iops']] = iops + if 'storage' in vquota_codes and storage is not None: + args[vquota_codes['storage']] = storage return QuotaInformation(**args) # This method is wrapped with an LRU cache in the constructor. @@ -900,7 +964,14 @@ class AwsAdapter(statemachine.Adapter): def _completeCreateInstance(self, future): if not future.done(): return None - return future.result() + try: + return future.result() + except botocore.exceptions.ClientError as error: + if error.response['Error']['Code'] == 'VolumeLimitExceeded': + # Re-raise as a quota exception so that the + # statemachine driver resets quota. 
+                raise exceptions.QuotaException(str(error))
+            raise
 
     def _createInstance(self, label, image_external_id, tags,
                         hostname, log):
diff --git a/nodepool/tests/fixtures/aws/aws-volume-quota.yaml b/nodepool/tests/fixtures/aws/aws-volume-quota.yaml
new file mode 100644
index 000000000..96d40c3b6
--- /dev/null
+++ b/nodepool/tests/fixtures/aws/aws-volume-quota.yaml
@@ -0,0 +1,42 @@
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+zookeeper-tls:
+  ca: {zookeeper_ca}
+  cert: {zookeeper_cert}
+  key: {zookeeper_key}
+
+labels:
+  - name: volume-gp2
+  - name: volume-gp3
+
+providers:
+  - name: ec2-us-west-2
+    driver: aws
+    region-name: us-west-2
+    cloud-images:
+      - name: ubuntu1404
+        image-id: ami-1e749f67
+        username: ubuntu
+    pools:
+      - name: main
+        subnet-id: {subnet_id}
+        security-group-id: {security_group_id}
+        node-attributes:
+          key1: value1
+          key2: value2
+        labels:
+          - name: volume-gp2
+            cloud-image: ubuntu1404
+            instance-type: t3.medium
+            key-name: zuul
+            volume-type: gp2
+            volume-size: 1000
+          - name: volume-gp3
+            cloud-image: ubuntu1404
+            instance-type: t3.medium
+            key-name: zuul
+            volume-type: gp3
+            volume-size: 1000
diff --git a/nodepool/tests/unit/test_driver_aws.py b/nodepool/tests/unit/test_driver_aws.py
index 7a6a0ff97..ea7e1f7d2 100644
--- a/nodepool/tests/unit/test_driver_aws.py
+++ b/nodepool/tests/unit/test_driver_aws.py
@@ -438,6 +438,65 @@ class TestDriverAws(tests.DBTestCase):
         # Assert that the second request is still being deferred
         req2 = self.waitForNodeRequest(req2, (zk.REQUESTED,))
 
+    @aws_quotas({
+        'L-1216C47A': 200,  # instance
+        'L-D18FCD1D': 1.0,  # gp2 storage (TB)
+        'L-7A658B76': 1.0,  # gp3 storage (TB)
+    })
+    def test_aws_volume_quota(self):
+        # Test volume quotas
+
+        # Moto doesn't correctly pass through iops when creating
+        # instances, so we can't test volume types that require iops.
+        # Therefore in this test we only cover storage quotas.
+        configfile = self.setup_config('aws/aws-volume-quota.yaml')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        self.startPool(pool)
+
+        # Create a gp2 node request
+        req1 = zk.NodeRequest()
+        req1.state = zk.REQUESTED
+        req1.node_types.append('volume-gp2')
+        self.zk.storeNodeRequest(req1)
+        self.log.debug("Waiting for request %s", req1.id)
+        req1 = self.waitForNodeRequest(req1)
+        node1 = self.assertSuccess(req1)
+
+        # Create a second gp2 node request; this should be
+        # over quota so it won't be fulfilled.
+        req2 = zk.NodeRequest()
+        req2.state = zk.REQUESTED
+        req2.node_types.append('volume-gp2')
+        self.zk.storeNodeRequest(req2)
+        self.log.debug("Waiting for request %s", req2.id)
+        req2 = self.waitForNodeRequest(req2, (zk.PENDING,))
+
+        # Make sure we're paused while we attempt to fulfill the
+        # second request.
+        pool_worker = pool.getPoolWorkers('ec2-us-west-2')
+        for _ in iterate_timeout(30, Exception, 'paused handler'):
+            if pool_worker[0].paused_handlers:
+                break
+
+        # Release the first node so that the second can be fulfilled.
+        node1.state = zk.USED
+        self.zk.storeNode(node1)
+        self.waitForNodeDeletion(node1)
+
+        # Make sure the second node exists now.
+        req2 = self.waitForNodeRequest(req2)
+        self.assertSuccess(req2)
+
+        # Create a gp3 node request which should succeed even
+        # though we're at quota for gp2 (but not gp3).
+        req3 = zk.NodeRequest()
+        req3.state = zk.REQUESTED
+        req3.node_types.append('volume-gp3')
+        self.zk.storeNodeRequest(req3)
+        self.log.debug("Waiting for request %s", req3.id)
+        req3 = self.waitForNodeRequest(req3)
+        self.assertSuccess(req3)
+
     def test_aws_node(self):
         req = self.requestNode('aws/aws.yaml', 'ubuntu1404')
         node = self.assertSuccess(req)
diff --git a/releasenotes/notes/aws-volume-quota-90f42264bb9cee01.yaml b/releasenotes/notes/aws-volume-quota-90f42264bb9cee01.yaml
new file mode 100644
index 000000000..77e50efc1
--- /dev/null
+++ b/releasenotes/notes/aws-volume-quota-90f42264bb9cee01.yaml
@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    The AWS driver now supports EBS volume quotas.  It will automatically
+    register the relevant limits from the cloud and ensure that labels
+    which specify EBS volume attributes stay under those limits.
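The EBS limits behind VOLUME_QUOTA_CODES come from the AWS Service Quotas service, and the storage limits are reported in TB while Nodepool tracks volume sizes in GB, which is why getQuotaLimits multiplies storage values by 1000. The following is a minimal standalone sketch of that lookup using the same boto3 call the adapter uses; the region and the subset of quota codes shown here are illustrative assumptions, not part of this change.

import boto3

# Subset of the volume quota codes from the adapter; which codes exist
# for a given volume type depends on the type (io1/io2 also have IOPS).
VOLUME_QUOTA_CODES = {
    'gp2': dict(storage='L-D18FCD1D'),
    'gp3': dict(storage='L-7A658B76'),
    'io2': dict(iops='L-8D977E7E', storage='L-09BD8365'),
}

# The region is an assumption for this sketch.
quotas = boto3.client('service-quotas', region_name='us-west-2')

limits = {}
for volume_type, codes in VOLUME_QUOTA_CODES.items():
    for resource, code in codes.items():
        response = quotas.get_service_quota(
            ServiceCode='ebs',
            QuotaCode=code,
        )
        value = response['Quota']['Value']
        if resource == 'storage':
            # AWS reports EBS storage limits in TB; Nodepool's volume
            # sizes are in GB, so translate the limit to GB.
            value *= 1000
        limits[(volume_type, resource)] = value

for (volume_type, resource), value in sorted(limits.items()):
    print(f"{volume_type} {resource}: {value}")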
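To make the quota accounting in test_aws_volume_quota concrete: the fixture registers a gp2 storage limit of 1.0 TB (translated to 1000 GB) and each label requests a 1000 GB volume, so one gp2 node consumes the entire gp2 storage quota while leaving gp3 untouched. The sketch below illustrates that arithmetic with plain dicts keyed by the quota codes; it is not Nodepool's actual QuotaInformation API, only an illustration of the bookkeeping.

# Quota codes for gp2/gp3 storage, as in the adapter.
GP2_STORAGE = 'L-D18FCD1D'
GP3_STORAGE = 'L-7A658B76'

# Limits as registered by getQuotaLimits(): 1.0 TB -> 1000 GB each.
limits = {GP2_STORAGE: 1.0 * 1000, GP3_STORAGE: 1.0 * 1000}


def fits(used, requested, limits):
    """Return True if the requested resources fit under the limits."""
    return all(
        used.get(code, 0) + amount <= limits.get(code, float('inf'))
        for code, amount in requested.items()
    )


used = {}
gp2_label = {GP2_STORAGE: 1000}  # volume-type: gp2, volume-size: 1000
gp3_label = {GP3_STORAGE: 1000}  # volume-type: gp3, volume-size: 1000

print(fits(used, gp2_label, limits))  # True: the first gp2 node launches
used = {GP2_STORAGE: 1000}            # its volume now counts against gp2
print(fits(used, gp2_label, limits))  # False: a second gp2 node is deferred
print(fits(used, gp3_label, limits))  # True: the gp3 quota is unaffected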