Amazon EC2 Spot support
This adds support for launching Amazon EC2 Spot instances (https://aws.amazon.com/ec2/spot/), which come with huge cost-saving opportunities. Amazon EC2 Spot instances are spare Amazon EC2 capacity that you can get at a discount of up to 90% compared to on-demand pricing. In contrast to on-demand instances, Spot instances can be reclaimed with a 2-minute notification in advance (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html). When :attr:`providers.[aws].pools.labels.use-spot` is set to True, the AWS driver will launch Spot instances. If an instance gets interrupted, it will be terminated and no replacement instance will be launched. Change-Id: I9868d014991d78e7b2421439403ae1371b33524c
This commit is contained in:
parent
0e7de19664
commit
36dbff84ba
@ -66,6 +66,7 @@ Selecting the ``aws`` driver adds the following options to the
|
|||||||
cloud-image: debian9
|
cloud-image: debian9
|
||||||
instance-type: t3.large
|
instance-type: t3.large
|
||||||
key-name: zuul
|
key-name: zuul
|
||||||
|
use-spot: True
|
||||||
tags:
|
tags:
|
||||||
key1: value1
|
key1: value1
|
||||||
key2: value2
|
key2: value2
|
||||||
@ -741,6 +742,30 @@ Selecting the ``aws`` driver adds the following options to the
|
|||||||
dynamic-tags:
|
dynamic-tags:
|
||||||
request_info: "Created for request {request.id}"
|
request_info: "Created for request {request.id}"
|
||||||
|
|
||||||
|
.. attr:: use-spot
|
||||||
|
:type: bool
|
||||||
|
:default: False
|
||||||
|
|
||||||
|
When set to True, Nodepool will try to launch an Amazon EC2 Spot
|
||||||
|
instance, instead of an On-Demand instance. Spot instances let
|
||||||
|
you take advantage of unused EC2 capacity at a discount.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
.. code-block:: yaml
|
||||||
|
|
||||||
|
labels:
|
||||||
|
- name: frugal
|
||||||
|
use-spot: True
|
||||||
|
|
||||||
|
.. note:: As Amazon EC2 Spot instances take advantage of unused
|
||||||
|
EC2 capacity, you may not get an instance if demand
|
||||||
|
is high. In addition, Amazon EC2 may interrupt your
|
||||||
|
Spot instance and reclaim it with a two-minute warning
|
||||||
|
up front. Therefore, you might want to set up alternative
|
||||||
|
nodesets as fallback.
|
||||||
|
|
||||||
|
|
||||||
.. _`EBS volume type`: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html
|
.. _`EBS volume type`: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html
|
||||||
.. _`AWS region`: https://docs.aws.amazon.com/general/latest/gr/rande.html
|
.. _`AWS region`: https://docs.aws.amazon.com/general/latest/gr/rande.html
|
||||||
.. _`Boto configuration`: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
|
.. _`Boto configuration`: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
|
||||||
|
@ -60,26 +60,31 @@ def tag_list_to_dict(taglist):
|
|||||||
# https://aws.amazon.com/ec2/instance-types/high-memory/
|
# https://aws.amazon.com/ec2/instance-types/high-memory/
|
||||||
|
|
||||||
QUOTA_CODES = {
|
QUOTA_CODES = {
|
||||||
'a': 'L-1216C47A',
|
# INSTANCE FAMILY: [ON-DEMAND, SPOT]
|
||||||
'c': 'L-1216C47A',
|
'a': ['L-1216C47A', 'L-34B43A08'],
|
||||||
'd': 'L-1216C47A',
|
'c': ['L-1216C47A', 'L-34B43A08'],
|
||||||
'h': 'L-1216C47A',
|
'd': ['L-1216C47A', 'L-34B43A08'],
|
||||||
'i': 'L-1216C47A',
|
'h': ['L-1216C47A', 'L-34B43A08'],
|
||||||
'm': 'L-1216C47A',
|
'i': ['L-1216C47A', 'L-34B43A08'],
|
||||||
'r': 'L-1216C47A',
|
'm': ['L-1216C47A', 'L-34B43A08'],
|
||||||
't': 'L-1216C47A',
|
'r': ['L-1216C47A', 'L-34B43A08'],
|
||||||
'z': 'L-1216C47A',
|
't': ['L-1216C47A', 'L-34B43A08'],
|
||||||
'dl': 'L-6E869C2A',
|
'z': ['L-1216C47A', 'L-34B43A08'],
|
||||||
'f': 'L-74FC7D96',
|
'dl': ['L-6E869C2A', 'L-85EED4F7'],
|
||||||
'g': 'L-DB2E81BA',
|
'f': ['L-74FC7D96', 'L-88CF9481'],
|
||||||
'vt': 'L-DB2E81BA',
|
'g': ['L-DB2E81BA', 'L-3819A6DF'],
|
||||||
'u-': 'L-43DA4232', # 'high memory'
|
'vt': ['L-DB2E81BA', 'L-3819A6DF'],
|
||||||
'inf': 'L-1945791B',
|
'u-': ['L-43DA4232', ''], # 'high memory'
|
||||||
'p': 'L-417A185B',
|
'inf': ['L-1945791B', 'L-B5D1601B'],
|
||||||
'x': 'L-7295265B',
|
'p': ['L-417A185B', 'L-7212CCBC'],
|
||||||
|
'x': ['L-7295265B', 'L-E3A00192'],
|
||||||
|
'trn': ['L-2C3B7624', 'L-6B0D517C'],
|
||||||
|
'hpc': ['L-F7808C92', '']
|
||||||
}
|
}
|
||||||
|
|
||||||
CACHE_TTL = 10
|
CACHE_TTL = 10
|
||||||
|
ON_DEMAND = 0
|
||||||
|
SPOT = 1
|
||||||
|
|
||||||
|
|
||||||
class AwsInstance(statemachine.Instance):
|
class AwsInstance(statemachine.Instance):
|
||||||
@ -183,7 +188,8 @@ class AwsCreateStateMachine(statemachine.StateMachine):
|
|||||||
return
|
return
|
||||||
self.instance = instance
|
self.instance = instance
|
||||||
self.quota = self.adapter._getQuotaForInstanceType(
|
self.quota = self.adapter._getQuotaForInstanceType(
|
||||||
self.instance.instance_type)
|
self.instance.instance_type,
|
||||||
|
SPOT if self.label.use_spot else ON_DEMAND)
|
||||||
self.state = self.INSTANCE_CREATING
|
self.state = self.INSTANCE_CREATING
|
||||||
|
|
||||||
if self.state == self.INSTANCE_CREATING:
|
if self.state == self.INSTANCE_CREATING:
|
||||||
@ -360,35 +366,45 @@ class AwsAdapter(statemachine.Adapter):
|
|||||||
for instance in self._listInstances():
|
for instance in self._listInstances():
|
||||||
if instance.state["Name"].lower() == "terminated":
|
if instance.state["Name"].lower() == "terminated":
|
||||||
continue
|
continue
|
||||||
quota = self._getQuotaForInstanceType(instance.instance_type)
|
quota = self._getQuotaForInstanceType(
|
||||||
|
instance.instance_type,
|
||||||
|
SPOT if instance.instance_lifecycle == 'spot' else ON_DEMAND)
|
||||||
yield AwsInstance(self.provider, instance, quota)
|
yield AwsInstance(self.provider, instance, quota)
|
||||||
|
|
||||||
def getQuotaLimits(self):
|
def getQuotaLimits(self):
|
||||||
# Get the instance types that this provider handles
|
# Get the instance types that this provider handles
|
||||||
instance_types = set()
|
instance_types = {}
|
||||||
for pool in self.provider.pools.values():
|
for pool in self.provider.pools.values():
|
||||||
for label in pool.labels.values():
|
for label in pool.labels.values():
|
||||||
instance_types.add(label.instance_type)
|
if label.instance_type not in instance_types:
|
||||||
|
instance_types[label.instance_type] = set()
|
||||||
|
instance_types[label.instance_type].add(
|
||||||
|
SPOT if label.use_spot else ON_DEMAND)
|
||||||
args = dict(default=math.inf)
|
args = dict(default=math.inf)
|
||||||
for instance_type in instance_types:
|
for instance_type in instance_types:
|
||||||
code = self._getQuotaCodeForInstanceType(instance_type)
|
for market_type_option in instance_types[instance_type]:
|
||||||
if code in args:
|
code = self._getQuotaCodeForInstanceType(instance_type,
|
||||||
continue
|
market_type_option)
|
||||||
if not code:
|
if code in args:
|
||||||
self.log.warning("Unknown quota code for instance type: %s",
|
continue
|
||||||
instance_type)
|
if not code:
|
||||||
continue
|
self.log.warning(
|
||||||
with self.non_mutating_rate_limiter:
|
"Unknown quota code for instance type: %s",
|
||||||
self.log.debug("Getting quota limits for %s", code)
|
instance_type)
|
||||||
response = self.aws_quotas.get_service_quota(
|
continue
|
||||||
ServiceCode='ec2',
|
with self.non_mutating_rate_limiter:
|
||||||
QuotaCode=code,
|
self.log.debug("Getting quota limits for %s", code)
|
||||||
)
|
response = self.aws_quotas.get_service_quota(
|
||||||
args[code] = response['Quota']['Value']
|
ServiceCode='ec2',
|
||||||
|
QuotaCode=code,
|
||||||
|
)
|
||||||
|
args[code] = response['Quota']['Value']
|
||||||
return QuotaInformation(**args)
|
return QuotaInformation(**args)
|
||||||
|
|
||||||
def getQuotaForLabel(self, label):
|
def getQuotaForLabel(self, label):
|
||||||
return self._getQuotaForInstanceType(label.instance_type)
|
return self._getQuotaForInstanceType(
|
||||||
|
label.instance_type,
|
||||||
|
SPOT if label.use_spot else ON_DEMAND)
|
||||||
|
|
||||||
def uploadImage(self, provider_image, image_name, filename,
|
def uploadImage(self, provider_image, image_name, filename,
|
||||||
image_format, metadata, md5, sha256):
|
image_format, metadata, md5, sha256):
|
||||||
@ -753,18 +769,19 @@ class AwsAdapter(statemachine.Adapter):
|
|||||||
|
|
||||||
instance_key_re = re.compile(r'([a-z\-]+)\d.*')
|
instance_key_re = re.compile(r'([a-z\-]+)\d.*')
|
||||||
|
|
||||||
def _getQuotaCodeForInstanceType(self, instance_type):
|
def _getQuotaCodeForInstanceType(self, instance_type, market_type_option):
|
||||||
m = self.instance_key_re.match(instance_type)
|
m = self.instance_key_re.match(instance_type)
|
||||||
if m:
|
if m:
|
||||||
key = m.group(1)
|
key = m.group(1)
|
||||||
return QUOTA_CODES.get(key)
|
return QUOTA_CODES.get(key)[market_type_option]
|
||||||
|
|
||||||
def _getQuotaForInstanceType(self, instance_type):
|
def _getQuotaForInstanceType(self, instance_type, market_type_option):
|
||||||
itype = self._getInstanceType(instance_type)
|
itype = self._getInstanceType(instance_type)
|
||||||
cores = itype['InstanceTypes'][0]['VCpuInfo']['DefaultCores']
|
cores = itype['InstanceTypes'][0]['VCpuInfo']['DefaultCores']
|
||||||
vcpus = itype['InstanceTypes'][0]['VCpuInfo']['DefaultVCpus']
|
vcpus = itype['InstanceTypes'][0]['VCpuInfo']['DefaultVCpus']
|
||||||
ram = itype['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']
|
ram = itype['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']
|
||||||
code = self._getQuotaCodeForInstanceType(instance_type)
|
code = self._getQuotaCodeForInstanceType(instance_type,
|
||||||
|
market_type_option)
|
||||||
# We include cores to match the overall cores quota (which may
|
# We include cores to match the overall cores quota (which may
|
||||||
# be set as a tenant resource limit), and include vCPUs for the
|
# be set as a tenant resource limit), and include vCPUs for the
|
||||||
# specific AWS quota code which is for a specific instance
|
# specific AWS quota code which is for a specific instance
|
||||||
@ -967,6 +984,16 @@ class AwsAdapter(statemachine.Adapter):
|
|||||||
del mapping['Ebs']['Encrypted']
|
del mapping['Ebs']['Encrypted']
|
||||||
args['BlockDeviceMappings'] = [mapping]
|
args['BlockDeviceMappings'] = [mapping]
|
||||||
|
|
||||||
|
# enable EC2 Spot
|
||||||
|
if label.use_spot:
|
||||||
|
args['InstanceMarketOptions'] = {
|
||||||
|
'MarketType': 'spot',
|
||||||
|
'SpotOptions': {
|
||||||
|
'SpotInstanceType': 'one-time',
|
||||||
|
'InstanceInterruptionBehavior': 'terminate'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
with self.rate_limiter(log.debug, "Created instance"):
|
with self.rate_limiter(log.debug, "Created instance"):
|
||||||
log.debug(f"Creating VM {hostname}")
|
log.debug(f"Creating VM {hostname}")
|
||||||
instances = self.ec2.create_instances(**args)
|
instances = self.ec2.create_instances(**args)
|
||||||
|
@ -179,6 +179,7 @@ class AwsLabel(ConfigValue):
|
|||||||
self.tags = label.get('tags', {})
|
self.tags = label.get('tags', {})
|
||||||
self.dynamic_tags = label.get('dynamic-tags', {})
|
self.dynamic_tags = label.get('dynamic-tags', {})
|
||||||
self.host_key_checking = self.pool.host_key_checking
|
self.host_key_checking = self.pool.host_key_checking
|
||||||
|
self.use_spot = bool(label.get('use-spot', False))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def getSchema():
|
def getSchema():
|
||||||
@ -200,6 +201,7 @@ class AwsLabel(ConfigValue):
|
|||||||
},
|
},
|
||||||
'tags': dict,
|
'tags': dict,
|
||||||
'dynamic-tags': dict,
|
'dynamic-tags': dict,
|
||||||
|
'use-spot': bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
11
nodepool/tests/fixtures/aws/aws-quota.yaml
vendored
11
nodepool/tests/fixtures/aws/aws-quota.yaml
vendored
@ -15,6 +15,8 @@ tenant-resource-limits:
|
|||||||
labels:
|
labels:
|
||||||
- name: standard
|
- name: standard
|
||||||
- name: high
|
- name: high
|
||||||
|
- name: spot
|
||||||
|
- name: on-demand
|
||||||
|
|
||||||
providers:
|
providers:
|
||||||
- name: ec2-us-west-2
|
- name: ec2-us-west-2
|
||||||
@ -41,3 +43,12 @@ providers:
|
|||||||
cloud-image: ubuntu1404
|
cloud-image: ubuntu1404
|
||||||
instance-type: u-6tb1.112xlarge
|
instance-type: u-6tb1.112xlarge
|
||||||
key-name: zuul
|
key-name: zuul
|
||||||
|
- name: spot
|
||||||
|
cloud-image: ubuntu1404
|
||||||
|
instance-type: m6i.32xlarge
|
||||||
|
key-name: zuul
|
||||||
|
use-spot: True
|
||||||
|
- name: on-demand
|
||||||
|
cloud-image: ubuntu1404
|
||||||
|
instance-type: m6i.32xlarge
|
||||||
|
key-name: zuul
|
||||||
|
39
nodepool/tests/fixtures/aws/aws-spot.yaml
vendored
Normal file
39
nodepool/tests/fixtures/aws/aws-spot.yaml
vendored
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
zookeeper-servers:
|
||||||
|
- host: {zookeeper_host}
|
||||||
|
port: {zookeeper_port}
|
||||||
|
chroot: {zookeeper_chroot}
|
||||||
|
|
||||||
|
zookeeper-tls:
|
||||||
|
ca: {zookeeper_ca}
|
||||||
|
cert: {zookeeper_cert}
|
||||||
|
key: {zookeeper_key}
|
||||||
|
|
||||||
|
tenant-resource-limits:
|
||||||
|
- tenant-name: tenant-1
|
||||||
|
max-cores: 1024
|
||||||
|
|
||||||
|
labels:
|
||||||
|
- name: ubuntu1404-spot
|
||||||
|
|
||||||
|
providers:
|
||||||
|
- name: ec2-us-west-2
|
||||||
|
driver: aws
|
||||||
|
region-name: us-west-2
|
||||||
|
cloud-images:
|
||||||
|
- name: ubuntu1404
|
||||||
|
image-id: ami-1e749f67
|
||||||
|
username: ubuntu
|
||||||
|
pools:
|
||||||
|
- name: main
|
||||||
|
max-servers: 10
|
||||||
|
subnet-id: {subnet_id}
|
||||||
|
security-group-id: {security_group_id}
|
||||||
|
node-attributes:
|
||||||
|
key1: value1
|
||||||
|
key2: value2
|
||||||
|
labels:
|
||||||
|
- name: ubuntu1404-spot
|
||||||
|
cloud-image: ubuntu1404
|
||||||
|
instance-type: t3.medium
|
||||||
|
key-name: zuul-spot
|
||||||
|
use-spot: True
|
@ -244,6 +244,7 @@ class TestDriverAws(tests.DBTestCase):
|
|||||||
@aws_quotas({
|
@aws_quotas({
|
||||||
'L-1216C47A': 2,
|
'L-1216C47A': 2,
|
||||||
'L-43DA4232': 448,
|
'L-43DA4232': 448,
|
||||||
|
'L-34B43A08': 2
|
||||||
})
|
})
|
||||||
def test_aws_multi_quota(self):
|
def test_aws_multi_quota(self):
|
||||||
# Test multiple instance type quotas (standard and high-mem)
|
# Test multiple instance type quotas (standard and high-mem)
|
||||||
@ -295,6 +296,59 @@ class TestDriverAws(tests.DBTestCase):
|
|||||||
req3 = self.waitForNodeRequest(req3)
|
req3 = self.waitForNodeRequest(req3)
|
||||||
self.assertSuccess(req3)
|
self.assertSuccess(req3)
|
||||||
|
|
||||||
|
@aws_quotas({
|
||||||
|
'L-43DA4232': 448,
|
||||||
|
'L-1216C47A': 200,
|
||||||
|
'L-34B43A08': 200
|
||||||
|
})
|
||||||
|
def test_aws_multi_quota_spot(self):
|
||||||
|
# Test multiple instance type quotas (standard, high-mem and spot)
|
||||||
|
configfile = self.setup_config('aws/aws-quota.yaml')
|
||||||
|
pool = self.useNodepool(configfile, watermark_sleep=1)
|
||||||
|
pool.start()
|
||||||
|
|
||||||
|
# Create a spot node request which should succeed.
|
||||||
|
req1 = zk.NodeRequest()
|
||||||
|
req1.state = zk.REQUESTED
|
||||||
|
req1.node_types.append('spot')
|
||||||
|
self.zk.storeNodeRequest(req1)
|
||||||
|
self.log.debug("Waiting for request %s", req1.id)
|
||||||
|
req1 = self.waitForNodeRequest(req1)
|
||||||
|
node1 = self.assertSuccess(req1)
|
||||||
|
|
||||||
|
# Create an on-demand node request which should succeed.
|
||||||
|
req2 = zk.NodeRequest()
|
||||||
|
req2.state = zk.REQUESTED
|
||||||
|
req2.node_types.append('on-demand')
|
||||||
|
self.zk.storeNodeRequest(req2)
|
||||||
|
self.log.debug("Waiting for request %s", req2.id)
|
||||||
|
req2 = self.waitForNodeRequest(req2)
|
||||||
|
self.assertSuccess(req2)
|
||||||
|
|
||||||
|
# Create another spot node request which should be paused.
|
||||||
|
req3 = zk.NodeRequest()
|
||||||
|
req3.state = zk.REQUESTED
|
||||||
|
req3.node_types.append('spot')
|
||||||
|
self.zk.storeNodeRequest(req3)
|
||||||
|
self.log.debug("Waiting for request %s", req3.id)
|
||||||
|
req3 = self.waitForNodeRequest(req3, (zk.PENDING,))
|
||||||
|
|
||||||
|
# Make sure we're paused while we attempt to fulfill the
|
||||||
|
# third request.
|
||||||
|
pool_worker = pool.getPoolWorkers('ec2-us-west-2')
|
||||||
|
for _ in iterate_timeout(30, Exception, 'paused handler'):
|
||||||
|
if pool_worker[0].paused_handlers:
|
||||||
|
break
|
||||||
|
|
||||||
|
# Release the first spot node so that the third can be fulfilled.
|
||||||
|
node1.state = zk.USED
|
||||||
|
self.zk.storeNode(node1)
|
||||||
|
self.waitForNodeDeletion(node1)
|
||||||
|
|
||||||
|
# Make sure the third request's spot node exists now.
|
||||||
|
req3 = self.waitForNodeRequest(req3)
|
||||||
|
self.assertSuccess(req3)
|
||||||
|
|
||||||
@aws_quotas({
|
@aws_quotas({
|
||||||
'L-1216C47A': 1000,
|
'L-1216C47A': 1000,
|
||||||
'L-43DA4232': 1000,
|
'L-43DA4232': 1000,
|
||||||
@ -916,3 +970,12 @@ class TestDriverAws(tests.DBTestCase):
|
|||||||
except botocore.exceptions.ClientError:
|
except botocore.exceptions.ClientError:
|
||||||
# Probably not found
|
# Probably not found
|
||||||
break
|
break
|
||||||
|
|
||||||
|
def test_aws_provisioning_spot_instances(self):
|
||||||
|
# Test creating a spot instance instead of an on-demand one.
|
||||||
|
req = self.requestNode('aws/aws-spot.yaml', 'ubuntu1404-spot')
|
||||||
|
node = self.assertSuccess(req)
|
||||||
|
instance = self.ec2.Instance(node.external_id)
|
||||||
|
self.assertEqual(instance.instance_lifecycle, 'spot')
|
||||||
|
# moto doesn't provide the spot_instance_request_id
|
||||||
|
# self.assertIsNotNone(instance.spot_instance_request_id)
|
||||||
|
6
releasenotes/notes/aws-spot-9ef3d8ee39fde2b2.yaml
Normal file
6
releasenotes/notes/aws-spot-9ef3d8ee39fde2b2.yaml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
The AWS driver now supports launching Amazon EC2 Spot instances
|
||||||
|
(https://aws.amazon.com/ec2/spot/), when specifying
|
||||||
|
:attr:`providers.[aws].pools.labels.use-spot`.
|
Loading…
Reference in New Issue
Block a user