Merge "AWS: Add support for retrying image imports"

Authored by Zuul on 2023-08-28 18:43:56 +00:00; committed by Gerrit Code Review
commit 785f7dcbc9
8 changed files with 100 additions and 33 deletions


@@ -186,6 +186,17 @@ Selecting the ``aws`` driver adds the following options to the
       ``ova``, ``vhd``, ``vhdx``, ``vmdk``, ``raw`` (not all of which
       are supported by diskimage-builder).
 
+   .. attr:: image-import-timeout
+      :type: int
+
+      Generally there is no limit on the amount of time a successful
+      image import can take. However, some import tasks may encounter
+      temporary resource limitations from AWS. In these cases, if
+      this value is set, Nodepool will retry the import tasks until
+      the timeout is reached. If this is unset (the default), then
+      the first resource limitation detected will result in an error.
+      The value is in seconds.
+
    .. attr:: cloud-images
       :type: list
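Taken together with the surrounding settings, a provider stanza using the new option could look like the following sketch (the provider name is illustrative; the region, bucket, and timeout values mirror the test fixtures further down):

    providers:
      - name: ec2-us-west-2
        driver: aws
        region-name: us-west-2
        object-storage:
          bucket-name: nodepool
        # Retry throttled import tasks for up to 60 seconds before failing.
        image-import-timeout: 60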


@@ -478,6 +478,11 @@ class AwsAdapter(statemachine.Adapter):
                              bucket_name, object_filename):
         # Import snapshot
         self.log.debug(f"Importing {image_name} as snapshot")
-        with self.rate_limiter:
-            import_snapshot_task = self.ec2_client.import_snapshot(
-                DiskContainer={
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
+                with self.rate_limiter:
+                    import_snapshot_task = self.ec2_client.import_snapshot(
+                        DiskContainer={
@@ -494,6 +499,16 @@ class AwsAdapter(statemachine.Adapter):
-                    },
-                ]
-            )
+                            },
+                        ]
+                    )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                        'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
         task_id = import_snapshot_task['ImportTaskId']
 
         paginator = self.ec2_client.get_paginator(
@@ -571,6 +586,11 @@ class AwsAdapter(statemachine.Adapter):
                              bucket_name, object_filename):
         # Import image as AMI
         self.log.debug(f"Importing {image_name} as AMI")
-        with self.rate_limiter:
-            import_image_task = self.ec2_client.import_image(
-                Architecture=provider_image.architecture,
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
+                with self.rate_limiter:
+                    import_image_task = self.ec2_client.import_image(
+                        Architecture=provider_image.architecture,
@@ -588,6 +608,16 @@ class AwsAdapter(statemachine.Adapter):
-                    },
-                ]
-            )
+                            },
+                        ]
+                    )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                        'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
         task_id = import_image_task['ImportTaskId']
 
         paginator = self.ec2_client.get_paginator(
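Both import paths share the same retry shape: compute an absolute deadline once, then loop, breaking out on success, sleeping and retrying on ResourceCountLimitExceeded while the deadline has not passed, and re-raising anything else immediately. A condensed, self-contained sketch of that pattern (the helper name and the sleep value are stand-ins, not part of the adapter's API):

    import time

    import botocore.exceptions

    IMAGE_UPLOAD_SLEEP = 10  # stand-in; the adapter defines its own constant


    def call_with_import_retry(call, image_import_timeout=None):
        """Invoke `call`, retrying on AWS import-task throttling.

        With no timeout configured, the first throttling error is
        re-raised immediately, matching the documented default.
        """
        deadline = time.time()
        if image_import_timeout:
            deadline += image_import_timeout
        while True:
            try:
                return call()
            except botocore.exceptions.ClientError as error:
                if (error.response['Error']['Code'] ==
                        'ResourceCountLimitExceeded'
                        and time.time() < deadline):
                    time.sleep(IMAGE_UPLOAD_SLEEP)
                    continue
                raise

Because the deadline is fixed before the first attempt, an unset timeout leaves the deadline in the past and the first throttling error propagates, which is the unset-default behavior described in the documentation hunk above.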


@@ -298,6 +298,8 @@ class AwsProviderConfig(ProviderConfig):
         self.object_storage = self.provider.get('object-storage')
         self.image_type = self.provider.get('image-format', 'raw')
         self.image_name_format = '{image_name}-{timestamp}'
+        self.image_import_timeout = self.provider.get(
+            'image-import-timeout', None)
         self.post_upload_hook = self.provider.get('post-upload-hook')
         self.max_servers = self.provider.get('max-servers', math.inf)
         self.max_cores = self.provider.get('max-cores', math.inf)
@@ -347,6 +349,7 @@ class AwsProviderConfig(ProviderConfig):
             'launch-retries': int,
             'object-storage': object_storage,
             'image-format': v.Any('ova', 'vhd', 'vhdx', 'vmdk', 'raw'),
+            'image-import-timeout': int,
             'max-servers': int,
             'max-cores': int,
             'max-ram': int,
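The schema entry above means the option is validated as an integer at config-load time. A minimal standalone sketch of that validation, assuming the voluptuous library the config module already uses (the `v` alias matches the diff; ALLOW_EXTRA is only to keep the sketch small):

    import voluptuous as v

    # The real provider schema enumerates every key explicitly.
    schema = v.Schema({'image-import-timeout': int}, extra=v.ALLOW_EXTRA)

    schema({'image-import-timeout': 60})  # passes
    try:
        schema({'image-import-timeout': '60'})
    except v.MultipleInvalid:
        print("strings are rejected; the value must be an int")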


@@ -27,6 +27,7 @@ providers:
     region-name: us-west-2
     object-storage:
       bucket-name: nodepool
+    image-import-timeout: 60
     diskimages:
       - name: fake-image
         tags:


@@ -27,6 +27,7 @@ providers:
     region-name: us-west-2
     object-storage:
       bucket-name: nodepool
+    image-import-timeout: 60
     diskimages:
       - name: fake-image
         tags:


@@ -16,6 +16,7 @@
 import logging
 import uuid
 
+import botocore
 import boto3
@@ -136,8 +137,14 @@ class FakeAws:
         self.tasks = {}
         self.ec2 = boto3.resource('ec2', region_name='us-west-2')
         self.ec2_client = boto3.client('ec2', region_name='us-west-2')
+        self.fail_import_count = 0
 
     def import_snapshot(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportSnapshot')
         task_id = uuid.uuid4().hex
         task = make_import_snapshot_stage_1(
             task_id,
@@ -162,6 +169,11 @@ class FakeAws:
         return snap_id
 
     def import_image(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportImage')
         task_id = uuid.uuid4().hex
         task = make_import_image_stage_1(
             task_id,
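The fake raises the throttling error a configured number of times and then succeeds, which lets the tests below exercise the retry path deterministically. A self-contained sketch of the same fail-injection idea (FlakyImporter and the returned task id are illustrative, not part of the fake's real interface):

    import botocore.exceptions


    class FlakyImporter:
        """Fail-injection stub: raise N throttling errors, then succeed."""

        def __init__(self, failures=0):
            self.fail_import_count = failures

        def import_snapshot(self):
            if self.fail_import_count:
                self.fail_import_count -= 1
                raise botocore.exceptions.ClientError(
                    {'Error': {'Code': 'ResourceCountLimitExceeded'}},
                    'ImportSnapshot')
            return {'ImportTaskId': 'import-snap-0123456789abcdef0'}


    importer = FlakyImporter(failures=1)
    try:
        importer.import_snapshot()     # first call fails as configured
    except botocore.exceptions.ClientError:
        pass
    print(importer.import_snapshot())  # second call succeeds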


@@ -710,6 +710,7 @@ class TestDriverAws(tests.DBTestCase):
         self.assertTrue(response['EbsOptimized']['Value'])
 
     def test_aws_diskimage_snapshot(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage.yaml')
         self.useBuilder(configfile)
@@ -753,6 +754,7 @@ class TestDriverAws(tests.DBTestCase):
             ['Throughput'], 200)
 
     def test_aws_diskimage_image(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage-import-image.yaml')
         self.useBuilder(configfile)


@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    The AWS driver now supports an
+    :attr:`providers.[aws].image-import-timeout` option to control
+    automatic retries and timeouts when AWS import task resource
+    limits are reached.