Amazon EC2 Spot support

This adds support for launching Amazon EC2 Spot instances
(https://aws.amazon.com/ec2/spot/), which come with huge cost-saving
opportunities.

Amazon EC2 Spot instances are spare Amazon EC2 capacity that you can get
at a discount of up to 90% compared to on-demand pricing.
In contrast to on-demand instances, Spot instances can be reclaimed with a
2-minute notification in advance
(https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html).

When :attr:`providers.[aws].pools.labels.use-spot` is set to True, the AWS
driver will launch Spot instances. If an instance gets interrupted, it will be
terminated and no replacement instance will be launched.

Change-Id: I9868d014991d78e7b2421439403ae1371b33524c
This commit is contained in:
Christian Mueller 2022-11-24 12:32:37 +01:00
parent 0e7de19664
commit 36dbff84ba
7 changed files with 213 additions and 40 deletions

View File

@ -66,6 +66,7 @@ Selecting the ``aws`` driver adds the following options to the
cloud-image: debian9
instance-type: t3.large
key-name: zuul
use-spot: True
tags:
key1: value1
key2: value2
@ -741,6 +742,30 @@ Selecting the ``aws`` driver adds the following options to the
dynamic-tags:
request_info: "Created for request {request.id}"
.. attr:: use-spot
:type: bool
:default: False
When set to True, Nodepool will try to launch an Amazon EC2 Spot
instance, instead of an On-Demand instance. Spot instances let
you take advantage of unused EC2 capacity at a discount.
For example:
.. code-block:: yaml
labels:
- name: frugal
use-spot: True
.. note:: As Amazon EC2 Spot instances take advantage of unused
EC2 capacity, you may not get an instance if demand
is high. In addition, Amazon EC2 may interrupt your
Spot instance and reclaim it with a two-minute warning
upfront. Therefore, you might want to set up alternative
nodesets as fallback.
.. _`EBS volume type`: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html
.. _`AWS region`: https://docs.aws.amazon.com/general/latest/gr/rande.html
.. _`Boto configuration`: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html

View File

@ -60,26 +60,31 @@ def tag_list_to_dict(taglist):
# https://aws.amazon.com/ec2/instance-types/high-memory/
QUOTA_CODES = {
'a': 'L-1216C47A',
'c': 'L-1216C47A',
'd': 'L-1216C47A',
'h': 'L-1216C47A',
'i': 'L-1216C47A',
'm': 'L-1216C47A',
'r': 'L-1216C47A',
't': 'L-1216C47A',
'z': 'L-1216C47A',
'dl': 'L-6E869C2A',
'f': 'L-74FC7D96',
'g': 'L-DB2E81BA',
'vt': 'L-DB2E81BA',
'u-': 'L-43DA4232', # 'high memory'
'inf': 'L-1945791B',
'p': 'L-417A185B',
'x': 'L-7295265B',
# INSTANCE FAMILY: [ON-DEMAND, SPOT]
'a': ['L-1216C47A', 'L-34B43A08'],
'c': ['L-1216C47A', 'L-34B43A08'],
'd': ['L-1216C47A', 'L-34B43A08'],
'h': ['L-1216C47A', 'L-34B43A08'],
'i': ['L-1216C47A', 'L-34B43A08'],
'm': ['L-1216C47A', 'L-34B43A08'],
'r': ['L-1216C47A', 'L-34B43A08'],
't': ['L-1216C47A', 'L-34B43A08'],
'z': ['L-1216C47A', 'L-34B43A08'],
'dl': ['L-6E869C2A', 'L-85EED4F7'],
'f': ['L-74FC7D96', 'L-88CF9481'],
'g': ['L-DB2E81BA', 'L-3819A6DF'],
'vt': ['L-DB2E81BA', 'L-3819A6DF'],
'u-': ['L-43DA4232', ''], # 'high memory'
'inf': ['L-1945791B', 'L-B5D1601B'],
'p': ['L-417A185B', 'L-7212CCBC'],
'x': ['L-7295265B', 'L-E3A00192'],
'trn': ['L-2C3B7624', 'L-6B0D517C'],
'hpc': ['L-F7808C92', '']
}
CACHE_TTL = 10
ON_DEMAND = 0
SPOT = 1
class AwsInstance(statemachine.Instance):
@ -183,7 +188,8 @@ class AwsCreateStateMachine(statemachine.StateMachine):
return
self.instance = instance
self.quota = self.adapter._getQuotaForInstanceType(
self.instance.instance_type)
self.instance.instance_type,
SPOT if self.label.use_spot else ON_DEMAND)
self.state = self.INSTANCE_CREATING
if self.state == self.INSTANCE_CREATING:
@ -360,22 +366,30 @@ class AwsAdapter(statemachine.Adapter):
for instance in self._listInstances():
if instance.state["Name"].lower() == "terminated":
continue
quota = self._getQuotaForInstanceType(instance.instance_type)
quota = self._getQuotaForInstanceType(
instance.instance_type,
SPOT if instance.instance_lifecycle == 'spot' else ON_DEMAND)
yield AwsInstance(self.provider, instance, quota)
def getQuotaLimits(self):
# Get the instance types that this provider handles
instance_types = set()
instance_types = {}
for pool in self.provider.pools.values():
for label in pool.labels.values():
instance_types.add(label.instance_type)
if label.instance_type not in instance_types:
instance_types[label.instance_type] = set()
instance_types[label.instance_type].add(
SPOT if label.use_spot else ON_DEMAND)
args = dict(default=math.inf)
for instance_type in instance_types:
code = self._getQuotaCodeForInstanceType(instance_type)
for market_type_option in instance_types[instance_type]:
code = self._getQuotaCodeForInstanceType(instance_type,
market_type_option)
if code in args:
continue
if not code:
self.log.warning("Unknown quota code for instance type: %s",
self.log.warning(
"Unknown quota code for instance type: %s",
instance_type)
continue
with self.non_mutating_rate_limiter:
@ -388,7 +402,9 @@ class AwsAdapter(statemachine.Adapter):
return QuotaInformation(**args)
def getQuotaForLabel(self, label):
return self._getQuotaForInstanceType(label.instance_type)
return self._getQuotaForInstanceType(
label.instance_type,
SPOT if label.use_spot else ON_DEMAND)
def uploadImage(self, provider_image, image_name, filename,
image_format, metadata, md5, sha256):
@ -753,18 +769,19 @@ class AwsAdapter(statemachine.Adapter):
instance_key_re = re.compile(r'([a-z\-]+)\d.*')
def _getQuotaCodeForInstanceType(self, instance_type):
def _getQuotaCodeForInstanceType(self, instance_type, market_type_option):
m = self.instance_key_re.match(instance_type)
if m:
key = m.group(1)
return QUOTA_CODES.get(key)
return QUOTA_CODES.get(key)[market_type_option]
def _getQuotaForInstanceType(self, instance_type):
def _getQuotaForInstanceType(self, instance_type, market_type_option):
itype = self._getInstanceType(instance_type)
cores = itype['InstanceTypes'][0]['VCpuInfo']['DefaultCores']
vcpus = itype['InstanceTypes'][0]['VCpuInfo']['DefaultVCpus']
ram = itype['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']
code = self._getQuotaCodeForInstanceType(instance_type)
code = self._getQuotaCodeForInstanceType(instance_type,
market_type_option)
# We include cores to match the overall cores quota (which may
# be set as a tenant resource limit), and include vCPUs for the
# specific AWS quota code which in for a specific instance
@ -967,6 +984,16 @@ class AwsAdapter(statemachine.Adapter):
del mapping['Ebs']['Encrypted']
args['BlockDeviceMappings'] = [mapping]
# enable EC2 Spot
if label.use_spot:
args['InstanceMarketOptions'] = {
'MarketType': 'spot',
'SpotOptions': {
'SpotInstanceType': 'one-time',
'InstanceInterruptionBehavior': 'terminate'
}
}
with self.rate_limiter(log.debug, "Created instance"):
log.debug(f"Creating VM {hostname}")
instances = self.ec2.create_instances(**args)

View File

@ -179,6 +179,7 @@ class AwsLabel(ConfigValue):
self.tags = label.get('tags', {})
self.dynamic_tags = label.get('dynamic-tags', {})
self.host_key_checking = self.pool.host_key_checking
self.use_spot = bool(label.get('use-spot', False))
@staticmethod
def getSchema():
@ -200,6 +201,7 @@ class AwsLabel(ConfigValue):
},
'tags': dict,
'dynamic-tags': dict,
'use-spot': bool,
}

View File

@ -15,6 +15,8 @@ tenant-resource-limits:
labels:
- name: standard
- name: high
- name: spot
- name: on-demand
providers:
- name: ec2-us-west-2
@ -41,3 +43,12 @@ providers:
cloud-image: ubuntu1404
instance-type: u-6tb1.112xlarge
key-name: zuul
- name: spot
cloud-image: ubuntu1404
instance-type: m6i.32xlarge
key-name: zuul
use-spot: True
- name: on-demand
cloud-image: ubuntu1404
instance-type: m6i.32xlarge
key-name: zuul

View File

@ -0,0 +1,39 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
tenant-resource-limits:
- tenant-name: tenant-1
max-cores: 1024
labels:
- name: ubuntu1404-spot
providers:
- name: ec2-us-west-2
driver: aws
region-name: us-west-2
cloud-images:
- name: ubuntu1404
image-id: ami-1e749f67
username: ubuntu
pools:
- name: main
max-servers: 10
subnet-id: {subnet_id}
security-group-id: {security_group_id}
node-attributes:
key1: value1
key2: value2
labels:
- name: ubuntu1404-spot
cloud-image: ubuntu1404
instance-type: t3.medium
key-name: zuul-spot
use-spot: True

View File

@ -244,6 +244,7 @@ class TestDriverAws(tests.DBTestCase):
@aws_quotas({
'L-1216C47A': 2,
'L-43DA4232': 448,
'L-34B43A08': 2
})
def test_aws_multi_quota(self):
# Test multiple instance type quotas (standard and high-mem)
@ -295,6 +296,59 @@ class TestDriverAws(tests.DBTestCase):
req3 = self.waitForNodeRequest(req3)
self.assertSuccess(req3)
@aws_quotas({
'L-43DA4232': 448,
'L-1216C47A': 200,
'L-34B43A08': 200
})
def test_aws_multi_quota_spot(self):
# Test multiple instance type quotas (standard, high-mem and spot)
configfile = self.setup_config('aws/aws-quota.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
# Create a spot node request which should succeed.
req1 = zk.NodeRequest()
req1.state = zk.REQUESTED
req1.node_types.append('spot')
self.zk.storeNodeRequest(req1)
self.log.debug("Waiting for request %s", req1.id)
req1 = self.waitForNodeRequest(req1)
node1 = self.assertSuccess(req1)
# Create an on-demand node request which should succeed.
req2 = zk.NodeRequest()
req2.state = zk.REQUESTED
req2.node_types.append('on-demand')
self.zk.storeNodeRequest(req2)
self.log.debug("Waiting for request %s", req2.id)
req2 = self.waitForNodeRequest(req2)
self.assertSuccess(req2)
# Create another spot node request which should be paused.
req3 = zk.NodeRequest()
req3.state = zk.REQUESTED
req3.node_types.append('spot')
self.zk.storeNodeRequest(req3)
self.log.debug("Waiting for request %s", req3.id)
req3 = self.waitForNodeRequest(req3, (zk.PENDING,))
# Make sure we're paused while we attempt to fulfill the
# third request.
pool_worker = pool.getPoolWorkers('ec2-us-west-2')
for _ in iterate_timeout(30, Exception, 'paused handler'):
if pool_worker[0].paused_handlers:
break
# Release the first spot node so that the third can be fulfilled.
node1.state = zk.USED
self.zk.storeNode(node1)
self.waitForNodeDeletion(node1)
# Make sure the third request (the second spot node) is
# fulfilled now.
req3 = self.waitForNodeRequest(req3)
self.assertSuccess(req3)
@aws_quotas({
'L-1216C47A': 1000,
'L-43DA4232': 1000,
@ -916,3 +970,12 @@ class TestDriverAws(tests.DBTestCase):
except botocore.exceptions.ClientError:
# Probably not found
break
def test_aws_provisioning_spot_instances(self):
# Test creating a spot instance instead of an on-demand one.
req = self.requestNode('aws/aws-spot.yaml', 'ubuntu1404-spot')
node = self.assertSuccess(req)
instance = self.ec2.Instance(node.external_id)
self.assertEqual(instance.instance_lifecycle, 'spot')
# moto doesn't provide the spot_instance_request_id
# self.assertIsNotNone(instance.spot_instance_request_id)

View File

@ -0,0 +1,6 @@
---
features:
- |
The AWS driver now supports launching Amazon EC2 Spot instances
(https://aws.amazon.com/ec2/spot/), when specifying
:attr:`providers.[aws].pools.labels.use-spot`.