Amazon EC2 Spot support
This adds support for launching Amazon EC2 Spot instances (https://aws.amazon.com/ec2/spot/), which come with huge cost saving opportunities. Amazon EC2 Spot instances are spare Amazon EC2 capacity that you can get at a discount of up to 90% compared to on-demand pricing. In contrast to on-demand instances, Spot instances can be reclaimed with a 2 minute notification in advance (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html). When :attr:`providers.[aws].pools.labels.use-spot` is set to True, the AWS driver will launch Spot instances. If an instance gets interrupted, it will be terminated and no replacement instance will be launched. Change-Id: I9868d014991d78e7b2421439403ae1371b33524c
This commit is contained in:
parent
0e7de19664
commit
36dbff84ba
@ -66,6 +66,7 @@ Selecting the ``aws`` driver adds the following options to the
|
||||
cloud-image: debian9
|
||||
instance-type: t3.large
|
||||
key-name: zuul
|
||||
use-spot: True
|
||||
tags:
|
||||
key1: value1
|
||||
key2: value2
|
||||
@ -741,6 +742,30 @@ Selecting the ``aws`` driver adds the following options to the
|
||||
dynamic-tags:
|
||||
request_info: "Created for request {request.id}"
|
||||
|
||||
.. attr:: use-spot
|
||||
:type: bool
|
||||
:default: False
|
||||
|
||||
When set to True, Nodepool will try to launch an Amazon EC2 Spot
|
||||
instance, instead of an On-Demand instance. Spot instances let
|
||||
you take advantage of unused EC2 capacity at a discount.
|
||||
|
||||
For example:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
labels:
|
||||
- name: frugal
|
||||
use-spot: True
|
||||
|
||||
.. note:: As Amazon EC2 Spot instances take advantage of unused
|
||||
EC2 capacity, you may not get an instance if demand
|
||||
is high. In addition, Amazon EC2 may interrupt your
|
||||
Spot instance and reclaim it with a two-minute warning
|
||||
upfront. Therefore, you might want to set up alternative
|
||||
nodesets as fallback.
|
||||
|
||||
|
||||
.. _`EBS volume type`: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html
|
||||
.. _`AWS region`: https://docs.aws.amazon.com/general/latest/gr/rande.html
|
||||
.. _`Boto configuration`: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
|
||||
|
@ -60,26 +60,31 @@ def tag_list_to_dict(taglist):
|
||||
# https://aws.amazon.com/ec2/instance-types/high-memory/
|
||||
|
||||
QUOTA_CODES = {
|
||||
'a': 'L-1216C47A',
|
||||
'c': 'L-1216C47A',
|
||||
'd': 'L-1216C47A',
|
||||
'h': 'L-1216C47A',
|
||||
'i': 'L-1216C47A',
|
||||
'm': 'L-1216C47A',
|
||||
'r': 'L-1216C47A',
|
||||
't': 'L-1216C47A',
|
||||
'z': 'L-1216C47A',
|
||||
'dl': 'L-6E869C2A',
|
||||
'f': 'L-74FC7D96',
|
||||
'g': 'L-DB2E81BA',
|
||||
'vt': 'L-DB2E81BA',
|
||||
'u-': 'L-43DA4232', # 'high memory'
|
||||
'inf': 'L-1945791B',
|
||||
'p': 'L-417A185B',
|
||||
'x': 'L-7295265B',
|
||||
# INSTANCE FAMILY: [ON-DEMAND, SPOT]
|
||||
'a': ['L-1216C47A', 'L-34B43A08'],
|
||||
'c': ['L-1216C47A', 'L-34B43A08'],
|
||||
'd': ['L-1216C47A', 'L-34B43A08'],
|
||||
'h': ['L-1216C47A', 'L-34B43A08'],
|
||||
'i': ['L-1216C47A', 'L-34B43A08'],
|
||||
'm': ['L-1216C47A', 'L-34B43A08'],
|
||||
'r': ['L-1216C47A', 'L-34B43A08'],
|
||||
't': ['L-1216C47A', 'L-34B43A08'],
|
||||
'z': ['L-1216C47A', 'L-34B43A08'],
|
||||
'dl': ['L-6E869C2A', 'L-85EED4F7'],
|
||||
'f': ['L-74FC7D96', 'L-88CF9481'],
|
||||
'g': ['L-DB2E81BA', 'L-3819A6DF'],
|
||||
'vt': ['L-DB2E81BA', 'L-3819A6DF'],
|
||||
'u-': ['L-43DA4232', ''], # 'high memory'
|
||||
'inf': ['L-1945791B', 'L-B5D1601B'],
|
||||
'p': ['L-417A185B', 'L-7212CCBC'],
|
||||
'x': ['L-7295265B', 'L-E3A00192'],
|
||||
'trn': ['L-2C3B7624', 'L-6B0D517C'],
|
||||
'hpc': ['L-F7808C92', '']
|
||||
}
|
||||
|
||||
CACHE_TTL = 10
|
||||
ON_DEMAND = 0
|
||||
SPOT = 1
|
||||
|
||||
|
||||
class AwsInstance(statemachine.Instance):
|
||||
@ -183,7 +188,8 @@ class AwsCreateStateMachine(statemachine.StateMachine):
|
||||
return
|
||||
self.instance = instance
|
||||
self.quota = self.adapter._getQuotaForInstanceType(
|
||||
self.instance.instance_type)
|
||||
self.instance.instance_type,
|
||||
SPOT if self.label.use_spot else ON_DEMAND)
|
||||
self.state = self.INSTANCE_CREATING
|
||||
|
||||
if self.state == self.INSTANCE_CREATING:
|
||||
@ -360,22 +366,30 @@ class AwsAdapter(statemachine.Adapter):
|
||||
for instance in self._listInstances():
|
||||
if instance.state["Name"].lower() == "terminated":
|
||||
continue
|
||||
quota = self._getQuotaForInstanceType(instance.instance_type)
|
||||
quota = self._getQuotaForInstanceType(
|
||||
instance.instance_type,
|
||||
SPOT if instance.instance_lifecycle == 'spot' else ON_DEMAND)
|
||||
yield AwsInstance(self.provider, instance, quota)
|
||||
|
||||
def getQuotaLimits(self):
|
||||
# Get the instance types that this provider handles
|
||||
instance_types = set()
|
||||
instance_types = {}
|
||||
for pool in self.provider.pools.values():
|
||||
for label in pool.labels.values():
|
||||
instance_types.add(label.instance_type)
|
||||
if label.instance_type not in instance_types:
|
||||
instance_types[label.instance_type] = set()
|
||||
instance_types[label.instance_type].add(
|
||||
SPOT if label.use_spot else ON_DEMAND)
|
||||
args = dict(default=math.inf)
|
||||
for instance_type in instance_types:
|
||||
code = self._getQuotaCodeForInstanceType(instance_type)
|
||||
for market_type_option in instance_types[instance_type]:
|
||||
code = self._getQuotaCodeForInstanceType(instance_type,
|
||||
market_type_option)
|
||||
if code in args:
|
||||
continue
|
||||
if not code:
|
||||
self.log.warning("Unknown quota code for instance type: %s",
|
||||
self.log.warning(
|
||||
"Unknown quota code for instance type: %s",
|
||||
instance_type)
|
||||
continue
|
||||
with self.non_mutating_rate_limiter:
|
||||
@ -388,7 +402,9 @@ class AwsAdapter(statemachine.Adapter):
|
||||
return QuotaInformation(**args)
|
||||
|
||||
def getQuotaForLabel(self, label):
|
||||
return self._getQuotaForInstanceType(label.instance_type)
|
||||
return self._getQuotaForInstanceType(
|
||||
label.instance_type,
|
||||
SPOT if label.use_spot else ON_DEMAND)
|
||||
|
||||
def uploadImage(self, provider_image, image_name, filename,
|
||||
image_format, metadata, md5, sha256):
|
||||
@ -753,18 +769,19 @@ class AwsAdapter(statemachine.Adapter):
|
||||
|
||||
instance_key_re = re.compile(r'([a-z\-]+)\d.*')
|
||||
|
||||
def _getQuotaCodeForInstanceType(self, instance_type):
|
||||
def _getQuotaCodeForInstanceType(self, instance_type, market_type_option):
|
||||
m = self.instance_key_re.match(instance_type)
|
||||
if m:
|
||||
key = m.group(1)
|
||||
return QUOTA_CODES.get(key)
|
||||
return QUOTA_CODES.get(key)[market_type_option]
|
||||
|
||||
def _getQuotaForInstanceType(self, instance_type):
|
||||
def _getQuotaForInstanceType(self, instance_type, market_type_option):
|
||||
itype = self._getInstanceType(instance_type)
|
||||
cores = itype['InstanceTypes'][0]['VCpuInfo']['DefaultCores']
|
||||
vcpus = itype['InstanceTypes'][0]['VCpuInfo']['DefaultVCpus']
|
||||
ram = itype['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']
|
||||
code = self._getQuotaCodeForInstanceType(instance_type)
|
||||
code = self._getQuotaCodeForInstanceType(instance_type,
|
||||
market_type_option)
|
||||
# We include cores to match the overall cores quota (which may
|
||||
# be set as a tenant resource limit), and include vCPUs for the
|
||||
# specific AWS quota code which is for a specific instance
|
||||
@ -967,6 +984,16 @@ class AwsAdapter(statemachine.Adapter):
|
||||
del mapping['Ebs']['Encrypted']
|
||||
args['BlockDeviceMappings'] = [mapping]
|
||||
|
||||
# enable EC2 Spot
|
||||
if label.use_spot:
|
||||
args['InstanceMarketOptions'] = {
|
||||
'MarketType': 'spot',
|
||||
'SpotOptions': {
|
||||
'SpotInstanceType': 'one-time',
|
||||
'InstanceInterruptionBehavior': 'terminate'
|
||||
}
|
||||
}
|
||||
|
||||
with self.rate_limiter(log.debug, "Created instance"):
|
||||
log.debug(f"Creating VM {hostname}")
|
||||
instances = self.ec2.create_instances(**args)
|
||||
|
@ -179,6 +179,7 @@ class AwsLabel(ConfigValue):
|
||||
self.tags = label.get('tags', {})
|
||||
self.dynamic_tags = label.get('dynamic-tags', {})
|
||||
self.host_key_checking = self.pool.host_key_checking
|
||||
self.use_spot = bool(label.get('use-spot', False))
|
||||
|
||||
@staticmethod
|
||||
def getSchema():
|
||||
@ -200,6 +201,7 @@ class AwsLabel(ConfigValue):
|
||||
},
|
||||
'tags': dict,
|
||||
'dynamic-tags': dict,
|
||||
'use-spot': bool,
|
||||
}
|
||||
|
||||
|
||||
|
11
nodepool/tests/fixtures/aws/aws-quota.yaml
vendored
11
nodepool/tests/fixtures/aws/aws-quota.yaml
vendored
@ -15,6 +15,8 @@ tenant-resource-limits:
|
||||
labels:
|
||||
- name: standard
|
||||
- name: high
|
||||
- name: spot
|
||||
- name: on-demand
|
||||
|
||||
providers:
|
||||
- name: ec2-us-west-2
|
||||
@ -41,3 +43,12 @@ providers:
|
||||
cloud-image: ubuntu1404
|
||||
instance-type: u-6tb1.112xlarge
|
||||
key-name: zuul
|
||||
- name: spot
|
||||
cloud-image: ubuntu1404
|
||||
instance-type: m6i.32xlarge
|
||||
key-name: zuul
|
||||
use-spot: True
|
||||
- name: on-demand
|
||||
cloud-image: ubuntu1404
|
||||
instance-type: m6i.32xlarge
|
||||
key-name: zuul
|
||||
|
39
nodepool/tests/fixtures/aws/aws-spot.yaml
vendored
Normal file
39
nodepool/tests/fixtures/aws/aws-spot.yaml
vendored
Normal file
@ -0,0 +1,39 @@
|
||||
zookeeper-servers:
|
||||
- host: {zookeeper_host}
|
||||
port: {zookeeper_port}
|
||||
chroot: {zookeeper_chroot}
|
||||
|
||||
zookeeper-tls:
|
||||
ca: {zookeeper_ca}
|
||||
cert: {zookeeper_cert}
|
||||
key: {zookeeper_key}
|
||||
|
||||
tenant-resource-limits:
|
||||
- tenant-name: tenant-1
|
||||
max-cores: 1024
|
||||
|
||||
labels:
|
||||
- name: ubuntu1404-spot
|
||||
|
||||
providers:
|
||||
- name: ec2-us-west-2
|
||||
driver: aws
|
||||
region-name: us-west-2
|
||||
cloud-images:
|
||||
- name: ubuntu1404
|
||||
image-id: ami-1e749f67
|
||||
username: ubuntu
|
||||
pools:
|
||||
- name: main
|
||||
max-servers: 10
|
||||
subnet-id: {subnet_id}
|
||||
security-group-id: {security_group_id}
|
||||
node-attributes:
|
||||
key1: value1
|
||||
key2: value2
|
||||
labels:
|
||||
- name: ubuntu1404-spot
|
||||
cloud-image: ubuntu1404
|
||||
instance-type: t3.medium
|
||||
key-name: zuul-spot
|
||||
use-spot: True
|
@ -244,6 +244,7 @@ class TestDriverAws(tests.DBTestCase):
|
||||
@aws_quotas({
|
||||
'L-1216C47A': 2,
|
||||
'L-43DA4232': 448,
|
||||
'L-34B43A08': 2
|
||||
})
|
||||
def test_aws_multi_quota(self):
|
||||
# Test multiple instance type quotas (standard and high-mem)
|
||||
@ -295,6 +296,59 @@ class TestDriverAws(tests.DBTestCase):
|
||||
req3 = self.waitForNodeRequest(req3)
|
||||
self.assertSuccess(req3)
|
||||
|
||||
@aws_quotas({
|
||||
'L-43DA4232': 448,
|
||||
'L-1216C47A': 200,
|
||||
'L-34B43A08': 200
|
||||
})
|
||||
def test_aws_multi_quota_spot(self):
|
||||
# Test multiple instance type quotas (standard, high-mem and spot)
|
||||
configfile = self.setup_config('aws/aws-quota.yaml')
|
||||
pool = self.useNodepool(configfile, watermark_sleep=1)
|
||||
pool.start()
|
||||
|
||||
# Create a spot node request which should succeed.
|
||||
req1 = zk.NodeRequest()
|
||||
req1.state = zk.REQUESTED
|
||||
req1.node_types.append('spot')
|
||||
self.zk.storeNodeRequest(req1)
|
||||
self.log.debug("Waiting for request %s", req1.id)
|
||||
req1 = self.waitForNodeRequest(req1)
|
||||
node1 = self.assertSuccess(req1)
|
||||
|
||||
# Create an on-demand node request which should succeed.
|
||||
req2 = zk.NodeRequest()
|
||||
req2.state = zk.REQUESTED
|
||||
req2.node_types.append('on-demand')
|
||||
self.zk.storeNodeRequest(req2)
|
||||
self.log.debug("Waiting for request %s", req2.id)
|
||||
req2 = self.waitForNodeRequest(req2)
|
||||
self.assertSuccess(req2)
|
||||
|
||||
# Create another spot node request which should be paused.
|
||||
req3 = zk.NodeRequest()
|
||||
req3.state = zk.REQUESTED
|
||||
req3.node_types.append('spot')
|
||||
self.zk.storeNodeRequest(req3)
|
||||
self.log.debug("Waiting for request %s", req3.id)
|
||||
req3 = self.waitForNodeRequest(req3, (zk.PENDING,))
|
||||
|
||||
# Make sure we're paused while we attempt to fulfill the
|
||||
# third request.
|
||||
pool_worker = pool.getPoolWorkers('ec2-us-west-2')
|
||||
for _ in iterate_timeout(30, Exception, 'paused handler'):
|
||||
if pool_worker[0].paused_handlers:
|
||||
break
|
||||
|
||||
# Release the first spot node so that the third can be fulfilled.
|
||||
node1.state = zk.USED
|
||||
self.zk.storeNode(node1)
|
||||
self.waitForNodeDeletion(node1)
|
||||
|
||||
# Make sure the third request has been fulfilled now.
|
||||
req3 = self.waitForNodeRequest(req3)
|
||||
self.assertSuccess(req3)
|
||||
|
||||
@aws_quotas({
|
||||
'L-1216C47A': 1000,
|
||||
'L-43DA4232': 1000,
|
||||
@ -916,3 +970,12 @@ class TestDriverAws(tests.DBTestCase):
|
||||
except botocore.exceptions.ClientError:
|
||||
# Probably not found
|
||||
break
|
||||
|
||||
def test_aws_provisioning_spot_instances(self):
|
||||
# Test creating a spot instance instead of an on-demand one.
|
||||
req = self.requestNode('aws/aws-spot.yaml', 'ubuntu1404-spot')
|
||||
node = self.assertSuccess(req)
|
||||
instance = self.ec2.Instance(node.external_id)
|
||||
self.assertEqual(instance.instance_lifecycle, 'spot')
|
||||
# moto doesn't provide the spot_instance_request_id
|
||||
# self.assertIsNotNone(instance.spot_instance_request_id)
|
||||
|
6
releasenotes/notes/aws-spot-9ef3d8ee39fde2b2.yaml
Normal file
6
releasenotes/notes/aws-spot-9ef3d8ee39fde2b2.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
The AWS driver now supports launching Amazon EC2 Spot instances
|
||||
(https://aws.amazon.com/ec2/spot/), when specifying
|
||||
:attr:`providers.[aws].pools.labels.use-spot`.
|
Loading…
Reference in New Issue
Block a user