Allow configuring nodepool launch retries

Nodepool currently hardcodes that 3 attempts are made to launch a node
in the cloud. Allow modifying this in your provider configuration.

Change-Id: I61f44e163d419771824daa2039f7cdecc74742aa
This commit is contained in:
Jamie Lennox 2017-02-21 10:15:31 -05:00
parent 646c48800b
commit 71035081d5
7 changed files with 89 additions and 1 deletions

View File

@ -296,6 +296,7 @@ provider, the Nodepool image types are also defined (see
- az1
boot-timeout: 120
launch-timeout: 900
launch-retries: 3
image-name-format: 'template-{image_name}-{timestamp}'
hostname-format: '{label.name}-{provider.name}-{node.id}'
ipv6-preferred: False
@ -413,6 +414,13 @@ provider, the Nodepool image types are also defined (see
In seconds. Default 3600.
``launch-retries``
The total number of attempts made to launch a server before the node
request is considered failed.
Default 3.
``keypair``
Default None

View File

@ -74,6 +74,7 @@ class ConfigValidator:
'boot-timeout': int,
'api-timeout': int,
'launch-timeout': int,
'launch-retries': int,
'rate': float,
'images': [images],
'hostname-format': str,

View File

@ -182,6 +182,7 @@ def loadConfig(config_path):
p.api_timeout = provider.get('api-timeout')
p.boot_timeout = provider.get('boot-timeout', 60)
p.launch_timeout = provider.get('launch-timeout', 3600)
p.launch_retries = provider.get('launch-retries', 3)
p.networks = []
for network in provider.get('networks', []):
n = Network()

View File

@ -900,7 +900,8 @@ class NodeRequestHandler(object):
self.zk.storeNodeRequest(self.request)
self.launch_manager = NodeLaunchManager(
self.zk, self.provider, self.labels, self.manager, retries=3)
self.zk, self.provider, self.labels, self.manager,
retries=self.provider.launch_retries)
ready_nodes = self.zk.getReadyNodesOfTypes(self.request.node_types)
for ntype in self.request.node_types:

View File

@ -38,6 +38,7 @@ providers:
boot-timeout: 120
max-servers: 184
max-concurrency: 10
launch-retries: 3
rate: 0.001
images:
- name: trusty

View File

@ -0,0 +1,55 @@
elements-dir: .
images-dir: '{images_dir}'
cron:
check: '*/15 * * * *'
cleanup: '*/1 * * * *'
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
labels:
- name: fake-label
image: fake-image
min-ready: 0
providers:
- name: fake-provider
providers:
- name: fake-provider
region-name: fake-region
keypair: 'if-present-use-this-keypair'
username: 'fake'
password: 'fake'
auth-url: 'fake'
project-id: 'fake'
max-servers: 96
pool: 'fake'
launch-retries: 2
networks:
- net-id: 'some-uuid'
rate: 0.0001
images:
- name: fake-image
min-ram: 8192
name-filter: 'Fake'
meta:
key: value
key2: value
targets:
- name: fake-target
diskimages:
- name: fake-image
elements:
- fedora
- vm
release: 21
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@ -271,6 +271,27 @@ class TestNodepool(tests.DBTestCase):
self.assertEqual('fake-provider', new_nodes[0].provider)
self.assertNotEqual(nodes[0], new_nodes[0])
@mock.patch('nodepool.provider_manager.FakeProviderManager.createServer')
def test_node_launch_retries(self, mock_create_server):
    """Check that launch attempts stop at the configured retry count.

    Server creation is patched to always raise, so the node request is
    expected to end up FAILED after exactly as many createServer calls
    as the fixture configuration allows.
    """
    # Make every attempt to create a server blow up.
    mock_create_server.side_effect = Exception('Boom!')

    config_path = self.setup_config('node_launch_retry.yaml')
    nodepool = self.useNodepool(config_path, watermark_sleep=1)
    self._useBuilder(config_path)
    nodepool.start()
    self.waitForImage('fake-provider', 'fake-image')

    # Submit a request for a single node of the fixture's label.
    request = zk.NodeRequest()
    request.state = zk.REQUESTED
    request.node_types.append('fake-label')
    self.zk.storeNodeRequest(request)

    result = self.waitForNodeRequest(request)
    self.assertEqual(result.state, zk.FAILED)

    # The fixture sets launch-retries to 2, so createServer must have
    # been called exactly twice.
    self.assertEqual(2, mock_create_server.call_count)
@skip("Disabled for early v3 development")
def test_node_delete_failure(self):
def fail_delete(self, name):