From c2d9c45655cc0417bff693656184170ce7598033 Mon Sep 17 00:00:00 2001
From: "James E. Blair"
Date: Thu, 10 Aug 2023 15:58:42 -0700
Subject: [PATCH] AWS: Add support for retrying image imports

AWS has limits on the number of image import tasks that can run
simultaneously.  In a busy system with large images, it would be better
to wait until those limits clear rather than delete the uploaded S3
object and start over, uploading it again.  To support this, we now
detect that condition and optionally retry for a specified amount of
time.  The default remains to bail on the first error.

Change-Id: I6aa7f79b2f73c4aa6743f11221907a731a82be34
---
 doc/source/aws.rst                            | 11 +++
 nodepool/driver/aws/adapter.py                | 96 ++++++++++++-------
 nodepool/driver/aws/config.py                 |  3 +
 .../fixtures/aws/diskimage-import-image.yaml  |  1 +
 nodepool/tests/fixtures/aws/diskimage.yaml    |  1 +
 nodepool/tests/unit/fake_aws.py               | 12 +++
 nodepool/tests/unit/test_driver_aws.py        |  2 +
 ...image-import-timeout-d53aaa1a921f7aa5.yaml |  7 ++
 8 files changed, 100 insertions(+), 33 deletions(-)
 create mode 100644 releasenotes/notes/aws-image-import-timeout-d53aaa1a921f7aa5.yaml
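The functional core of this change is a single deadline-based retry
pattern applied to both EC2 import calls.  Distilled from the
adapter.py hunks below, it looks roughly like the following sketch;
the do_import callable and sleep parameter are illustrative stand-ins
for the real import_snapshot/import_image calls and the
IMAGE_UPLOAD_SLEEP constant:

    # Sketch only, not part of the patch.
    import time

    import botocore.exceptions

    def retry_import(do_import, image_import_timeout=None, sleep=10):
        # With no timeout configured the deadline is "now", so the first
        # ResourceCountLimitExceeded error propagates immediately; that
        # preserves the pre-existing fail-fast behavior.
        deadline = time.time()
        if image_import_timeout:
            deadline += image_import_timeout
        while True:
            try:
                return do_import()
            except botocore.exceptions.ClientError as error:
                if (error.response['Error']['Code'] ==
                        'ResourceCountLimitExceeded'):
                    if time.time() < deadline:
                        time.sleep(sleep)
                        continue
                raise

Note that any other ClientError, or a limit error seen after the
deadline, is re-raised unchanged, so callers still see the original
exception.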
diff --git a/doc/source/aws.rst b/doc/source/aws.rst
index 71a61c9da..c53e708ce 100644
--- a/doc/source/aws.rst
+++ b/doc/source/aws.rst
@@ -186,6 +186,17 @@ Selecting the ``aws`` driver adds the following options to the
          ``ova``, ``vhd``, ``vhdx``, ``vmdk``, ``raw`` (not all of which
          are supported by diskimage-builder).
 
+      .. attr:: image-import-timeout
+         :type: int
+
+         Generally there is no limit on the amount of time a successful
+         image import can take.  However, some import tasks may encounter
+         temporary resource limitations from AWS.  In these cases, if
+         this value is set, Nodepool will retry the import tasks until
+         the timeout is reached.  If this is unset (the default), then
+         the first resource limitation detected will result in an error.
+         The value is in seconds.
+
    .. attr:: cloud-images
       :type: list
 
diff --git a/nodepool/driver/aws/adapter.py b/nodepool/driver/aws/adapter.py
index 1b9c8ce1d..b7dffb96c 100644
--- a/nodepool/driver/aws/adapter.py
+++ b/nodepool/driver/aws/adapter.py
@@ -434,22 +434,37 @@ class AwsAdapter(statemachine.Adapter):
                        bucket_name, object_filename):
         # Import snapshot
         self.log.debug(f"Importing {image_name} as snapshot")
-        with self.rate_limiter:
-            import_snapshot_task = self.ec2_client.import_snapshot(
-                DiskContainer={
-                    'Format': image_format,
-                    'UserBucket': {
-                        'S3Bucket': bucket_name,
-                        'S3Key': object_filename,
-                    },
-                },
-                TagSpecifications=[
-                    {
-                        'ResourceType': 'import-snapshot-task',
-                        'Tags': tag_dict_to_list(metadata),
-                    },
-                ]
-            )
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
+                with self.rate_limiter:
+                    import_snapshot_task = self.ec2_client.import_snapshot(
+                        DiskContainer={
+                            'Format': image_format,
+                            'UserBucket': {
+                                'S3Bucket': bucket_name,
+                                'S3Key': object_filename,
+                            },
+                        },
+                        TagSpecifications=[
+                            {
+                                'ResourceType': 'import-snapshot-task',
+                                'Tags': tag_dict_to_list(metadata),
+                            },
+                        ]
+                    )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                        'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
 
         task_id = import_snapshot_task['ImportTaskId']
         paginator = self.ec2_client.get_paginator(
@@ -527,23 +542,38 @@ class AwsAdapter(statemachine.Adapter):
                        bucket_name, object_filename):
         # Import image as AMI
         self.log.debug(f"Importing {image_name} as AMI")
-        with self.rate_limiter:
-            import_image_task = self.ec2_client.import_image(
-                Architecture=provider_image.architecture,
-                DiskContainers=[{
-                    'Format': image_format,
-                    'UserBucket': {
-                        'S3Bucket': bucket_name,
-                        'S3Key': object_filename,
-                    },
-                }],
-                TagSpecifications=[
-                    {
-                        'ResourceType': 'import-image-task',
-                        'Tags': tag_dict_to_list(metadata),
-                    },
-                ]
-            )
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
+                with self.rate_limiter:
+                    import_image_task = self.ec2_client.import_image(
+                        Architecture=provider_image.architecture,
+                        DiskContainers=[{
+                            'Format': image_format,
+                            'UserBucket': {
+                                'S3Bucket': bucket_name,
+                                'S3Key': object_filename,
+                            },
+                        }],
+                        TagSpecifications=[
+                            {
+                                'ResourceType': 'import-image-task',
+                                'Tags': tag_dict_to_list(metadata),
+                            },
+                        ]
+                    )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                        'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
 
         task_id = import_image_task['ImportTaskId']
         paginator = self.ec2_client.get_paginator(
diff --git a/nodepool/driver/aws/config.py b/nodepool/driver/aws/config.py
index 997d3f283..9f5b01eba 100644
--- a/nodepool/driver/aws/config.py
+++ b/nodepool/driver/aws/config.py
@@ -298,6 +298,8 @@ class AwsProviderConfig(ProviderConfig):
         self.object_storage = self.provider.get('object-storage')
         self.image_type = self.provider.get('image-format', 'raw')
         self.image_name_format = '{image_name}-{timestamp}'
+        self.image_import_timeout = self.provider.get(
+            'image-import-timeout', None)
         self.post_upload_hook = self.provider.get('post-upload-hook')
         self.max_servers = self.provider.get('max-servers', math.inf)
         self.max_cores = self.provider.get('max-cores', math.inf)
@@ -347,6 +349,7 @@ class AwsProviderConfig(ProviderConfig):
             'launch-retries': int,
             'object-storage': object_storage,
             'image-format': v.Any('ova', 'vhd', 'vhdx', 'vmdk', 'raw'),
+            'image-import-timeout': int,
             'max-servers': int,
             'max-cores': int,
             'max-ram': int,
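The schema entry above constrains the new option to a plain integer;
anything else is rejected at config load time.  A trimmed-down sketch
of that validation (the one-key schema here is illustrative, not
nodepool's full provider schema):

    import voluptuous as v

    schema = v.Schema({'image-import-timeout': int}, extra=v.ALLOW_EXTRA)

    schema({'image-import-timeout': 3600})  # accepted: integer seconds
    try:
        schema({'image-import-timeout': '1h'})
    except v.Invalid:
        print("rejected: the value must be an integer number of seconds")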
diff --git a/nodepool/tests/fixtures/aws/diskimage-import-image.yaml b/nodepool/tests/fixtures/aws/diskimage-import-image.yaml
index 4c55a93dc..14bf070b3 100644
--- a/nodepool/tests/fixtures/aws/diskimage-import-image.yaml
+++ b/nodepool/tests/fixtures/aws/diskimage-import-image.yaml
@@ -27,6 +27,7 @@ providers:
     region-name: us-west-2
     object-storage:
       bucket-name: nodepool
+    image-import-timeout: 60
     diskimages:
       - name: fake-image
         tags:
diff --git a/nodepool/tests/fixtures/aws/diskimage.yaml b/nodepool/tests/fixtures/aws/diskimage.yaml
index 14c9a9748..16508570f 100644
--- a/nodepool/tests/fixtures/aws/diskimage.yaml
+++ b/nodepool/tests/fixtures/aws/diskimage.yaml
@@ -27,6 +27,7 @@ providers:
     region-name: us-west-2
     object-storage:
       bucket-name: nodepool
+    image-import-timeout: 60
     diskimages:
       - name: fake-image
         tags:
diff --git a/nodepool/tests/unit/fake_aws.py b/nodepool/tests/unit/fake_aws.py
index 15617abcc..a7e2238d0 100644
--- a/nodepool/tests/unit/fake_aws.py
+++ b/nodepool/tests/unit/fake_aws.py
@@ -16,6 +16,7 @@
 import logging
 import uuid
 
+import botocore
 import boto3
 
 
@@ -136,8 +137,14 @@ class FakeAws:
         self.tasks = {}
         self.ec2 = boto3.resource('ec2', region_name='us-west-2')
         self.ec2_client = boto3.client('ec2', region_name='us-west-2')
+        self.fail_import_count = 0
 
     def import_snapshot(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportSnapshot')
         task_id = uuid.uuid4().hex
         task = make_import_snapshot_stage_1(
             task_id,
@@ -162,6 +169,11 @@ class FakeAws:
         return snap_id
 
     def import_image(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportImage')
         task_id = uuid.uuid4().hex
         task = make_import_image_stage_1(
             task_id,
diff --git a/nodepool/tests/unit/test_driver_aws.py b/nodepool/tests/unit/test_driver_aws.py
index 7a6a0ff97..8e6ed7226 100644
--- a/nodepool/tests/unit/test_driver_aws.py
+++ b/nodepool/tests/unit/test_driver_aws.py
@@ -651,6 +651,7 @@ class TestDriverAws(tests.DBTestCase):
         self.assertTrue(response['EbsOptimized']['Value'])
 
     def test_aws_diskimage_snapshot(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage.yaml')
         self.useBuilder(configfile)
 
@@ -693,6 +694,7 @@ class TestDriverAws(tests.DBTestCase):
             ['Throughput'], 200)
 
     def test_aws_diskimage_image(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage-import-image.yaml')
         self.useBuilder(configfile)
 
diff --git a/releasenotes/notes/aws-image-import-timeout-d53aaa1a921f7aa5.yaml b/releasenotes/notes/aws-image-import-timeout-d53aaa1a921f7aa5.yaml
new file mode 100644
index 000000000..2a83fdd4a
--- /dev/null
+++ b/releasenotes/notes/aws-image-import-timeout-d53aaa1a921f7aa5.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    The AWS driver now supports an
+    :attr:`providers.[aws].image-import-timeout` option to control
+    automatic retries and timeouts when AWS import task resource
+    limits are reached.
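The FakeAws methods above raise exactly the error shape that the
adapter's new retry condition inspects, which is what lets the updated
tests force one failure and then observe a successful retry against the
60-second fixture timeout.  A standalone sketch of that contract
(botocore's ClientError takes the error response dict and an operation
name):

    import botocore.exceptions

    error = botocore.exceptions.ClientError(
        {'Error': {'Code': 'ResourceCountLimitExceeded'}},
        'ImportSnapshot')
    assert error.response['Error']['Code'] == 'ResourceCountLimitExceeded'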