Merge "AWS: Add support for retrying image imports"
This commit is contained in:
commit
785f7dcbc9
@@ -186,6 +186,17 @@ Selecting the ``aws`` driver adds the following options to the
       ``ova``, ``vhd``, ``vhdx``, ``vmdk``, ``raw`` (not all of which
       are supported by diskimage-builder).
 
+   .. attr:: image-import-timeout
+      :type: int
+
+      Generally there is no limit on the amount of time a successful
+      image import can take.  However, some import tasks may encounter
+      temporary resource limitations from AWS.  In these cases, if
+      this value is set, Nodepool will retry the import tasks until
+      the timeout is reached.  If this is unset (the default), then
+      the first resource limitation detected will result in an error.
+      The value is in seconds.
+
    .. attr:: cloud-images
       :type: list
 
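The option describes a deadline rather than a per-request timeout: the clock starts before the first import attempt, and leaving the option unset means the very first throttling error is fatal. A minimal sketch of those semantics (the function name is illustrative, not part of the driver):

```python
import time


def import_deadline(image_import_timeout=None):
    """Compute the wall-clock deadline used when retrying imports.

    With no timeout configured the deadline is "now", so a later
    ``time.time() < deadline`` check is already false and the first
    ResourceCountLimitExceeded error propagates immediately.
    """
    deadline = time.time()
    if image_import_timeout:
        deadline += image_import_timeout
    return deadline
```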
@@ -478,6 +478,11 @@ class AwsAdapter(statemachine.Adapter):
                          bucket_name, object_filename):
         # Import snapshot
         self.log.debug(f"Importing {image_name} as snapshot")
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
                 with self.rate_limiter:
                     import_snapshot_task = self.ec2_client.import_snapshot(
                         DiskContainer={
@@ -494,6 +499,16 @@ class AwsAdapter(statemachine.Adapter):
                            },
                        ]
                    )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                    'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
         task_id = import_snapshot_task['ImportTaskId']
 
         paginator = self.ec2_client.get_paginator(
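Extracted from the hunks above, the retry pattern is: attempt the call, and on a ResourceCountLimitExceeded error sleep and loop while the deadline has not passed; any other error, or running out the clock, re-raises. A self-contained sketch of that pattern (`import_with_retry` and the stub below are illustrative; the driver inlines the loop and sleeps for `self.IMAGE_UPLOAD_SLEEP`):

```python
import time

import botocore.exceptions

RETRYABLE = 'ResourceCountLimitExceeded'


def import_with_retry(call, deadline, sleep=1):
    """Retry `call` on AWS resource-count throttling until `deadline`."""
    while True:
        try:
            return call()
        except botocore.exceptions.ClientError as error:
            if (error.response['Error']['Code'] == RETRYABLE
                    and time.time() < deadline):
                time.sleep(sleep)
                continue
            raise


# Demo: a stand-in import call that is throttled exactly once.
attempts = []


def fake_import_snapshot():
    attempts.append(1)
    if len(attempts) == 1:
        raise botocore.exceptions.ClientError(
            {'Error': {'Code': RETRYABLE}}, 'ImportSnapshot')
    return {'ImportTaskId': 'import-snap-0123'}  # made-up task id


task = import_with_retry(fake_import_snapshot, time.time() + 60)
assert task['ImportTaskId'] == 'import-snap-0123'
assert len(attempts) == 2
```

The next two hunks apply the same loop, unchanged except for the operation, to `import_image`.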
@@ -571,6 +586,11 @@ class AwsAdapter(statemachine.Adapter):
                       bucket_name, object_filename):
         # Import image as AMI
         self.log.debug(f"Importing {image_name} as AMI")
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
                 with self.rate_limiter:
                     import_image_task = self.ec2_client.import_image(
                         Architecture=provider_image.architecture,
@@ -588,6 +608,16 @@ class AwsAdapter(statemachine.Adapter):
                            },
                        ]
                    )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                    'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
         task_id = import_image_task['ImportTaskId']
 
         paginator = self.ec2_client.get_paginator(
@@ -298,6 +298,8 @@ class AwsProviderConfig(ProviderConfig):
         self.object_storage = self.provider.get('object-storage')
         self.image_type = self.provider.get('image-format', 'raw')
         self.image_name_format = '{image_name}-{timestamp}'
+        self.image_import_timeout = self.provider.get(
+            'image-import-timeout', None)
         self.post_upload_hook = self.provider.get('post-upload-hook')
         self.max_servers = self.provider.get('max-servers', math.inf)
         self.max_cores = self.provider.get('max-cores', math.inf)
@@ -347,6 +349,7 @@ class AwsProviderConfig(ProviderConfig):
             'launch-retries': int,
             'object-storage': object_storage,
             'image-format': v.Any('ova', 'vhd', 'vhdx', 'vmdk', 'raw'),
+            'image-import-timeout': int,
             'max-servers': int,
             'max-cores': int,
             'max-ram': int,
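Because the voluptuous schema now lists `'image-import-timeout': int`, a bad value fails config validation at load time instead of surfacing mid-upload. A quick standalone check of that behavior (the one-key schema below is a trimmed stand-in for the real provider schema):

```python
import voluptuous as v

# Trimmed stand-in: only the new key; everything else passed through.
schema = v.Schema({'image-import-timeout': int}, extra=v.ALLOW_EXTRA)

schema({'image-import-timeout': 60})         # ok: an integer
try:
    schema({'image-import-timeout': '60s'})  # rejected: not an integer
except v.Invalid as error:
    print(error)
```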
@@ -27,6 +27,7 @@ providers:
       region-name: us-west-2
       object-storage:
         bucket-name: nodepool
+      image-import-timeout: 60
       diskimages:
         - name: fake-image
           tags:
nodepool/tests/fixtures/aws/diskimage.yaml
@@ -27,6 +27,7 @@ providers:
       region-name: us-west-2
       object-storage:
         bucket-name: nodepool
+      image-import-timeout: 60
       diskimages:
         - name: fake-image
           tags:
@@ -16,6 +16,7 @@
 import logging
 import uuid
 
+import botocore
 import boto3
 
 
@@ -136,8 +137,14 @@ class FakeAws:
         self.tasks = {}
         self.ec2 = boto3.resource('ec2', region_name='us-west-2')
         self.ec2_client = boto3.client('ec2', region_name='us-west-2')
+        self.fail_import_count = 0
 
     def import_snapshot(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportSnapshot')
         task_id = uuid.uuid4().hex
         task = make_import_snapshot_stage_1(
             task_id,
@@ -162,6 +169,11 @@ class FakeAws:
         return snap_id
 
     def import_image(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportImage')
         task_id = uuid.uuid4().hex
         task = make_import_image_stage_1(
             task_id,
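The fake raises genuine `botocore.exceptions.ClientError` instances, so the adapter's `error.response['Error']['Code']` check is exercised exactly as it would be against real EC2. Note that `while self.fail_import_count:` behaves like an `if` here, since the `raise` exits the method; each call consumes at most one queued failure. A standalone check of the error round-trip (assumes only that botocore is installed):

```python
import botocore.exceptions

error = botocore.exceptions.ClientError(
    {'Error': {'Code': 'ResourceCountLimitExceeded'}}, 'ImportSnapshot')

# The adapter's retry check reads the code back out of the response dict.
assert error.response['Error']['Code'] == 'ResourceCountLimitExceeded'
print(error)  # "An error occurred (ResourceCountLimitExceeded) when
              #  calling the ImportSnapshot operation: ..."
```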
@@ -710,6 +710,7 @@ class TestDriverAws(tests.DBTestCase):
         self.assertTrue(response['EbsOptimized']['Value'])
 
     def test_aws_diskimage_snapshot(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage.yaml')
 
         self.useBuilder(configfile)
@@ -753,6 +754,7 @@ class TestDriverAws(tests.DBTestCase):
                 ['Throughput'], 200)
 
     def test_aws_diskimage_image(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage-import-image.yaml')
 
         self.useBuilder(configfile)
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    The AWS driver now supports an
+    :attr:`providers.[aws].image-import-timeout` option to control
+    automatic retries and timeouts when AWS import task resource
+    limits are reached.