diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index 1f4eea811..82a5b0016 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -296,6 +296,7 @@ provider, the Nodepool image types are also defined (see - az1 boot-timeout: 120 launch-timeout: 900 + launch-retries: 3 image-name-format: 'template-{image_name}-{timestamp}' hostname-format: '{label.name}-{provider.name}-{node.id}' ipv6-preferred: False @@ -413,6 +414,13 @@ provider, the Nodepool image types are also defined (see In seconds. Default 3600. + ``launch-retries`` + + The total number of attempts made to launch a server before considering + the node request failed. + + Default 3. + ``keypair`` Default None diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index 65afd65cc..f49ffe3d2 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -74,6 +74,7 @@ class ConfigValidator: 'boot-timeout': int, 'api-timeout': int, 'launch-timeout': int, + 'launch-retries': int, 'rate': float, 'images': [images], 'hostname-format': str, diff --git a/nodepool/config.py b/nodepool/config.py index 7a49152ba..0f76c551b 100644 --- a/nodepool/config.py +++ b/nodepool/config.py @@ -182,6 +182,7 @@ def loadConfig(config_path): p.api_timeout = provider.get('api-timeout') p.boot_timeout = provider.get('boot-timeout', 60) p.launch_timeout = provider.get('launch-timeout', 3600) + p.launch_retries = provider.get('launch-retries', 3) p.networks = [] for network in provider.get('networks', []): n = Network() diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index c26ba1094..c55f130ad 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -900,7 +900,8 @@ class NodeRequestHandler(object): self.zk.storeNodeRequest(self.request) self.launch_manager = NodeLaunchManager( - self.zk, self.provider, self.labels, self.manager, retries=3) + self.zk, self.provider, self.labels, self.manager, + retries=self.provider.launch_retries) 
ready_nodes = self.zk.getReadyNodesOfTypes(self.request.node_types) for ntype in self.request.node_types: diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml index 087bdfa9e..0da775bdf 100644 --- a/nodepool/tests/fixtures/config_validate/good.yaml +++ b/nodepool/tests/fixtures/config_validate/good.yaml @@ -38,6 +38,7 @@ providers: boot-timeout: 120 max-servers: 184 max-concurrency: 10 + launch-retries: 3 rate: 0.001 images: - name: trusty diff --git a/nodepool/tests/fixtures/node_launch_retry.yaml b/nodepool/tests/fixtures/node_launch_retry.yaml new file mode 100644 index 000000000..eee985ed2 --- /dev/null +++ b/nodepool/tests/fixtures/node_launch_retry.yaml @@ -0,0 +1,55 @@ +elements-dir: . +images-dir: '{images_dir}' + +cron: + check: '*/15 * * * *' + cleanup: '*/1 * * * *' + +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +labels: + - name: fake-label + image: fake-image + min-ready: 0 + providers: + - name: fake-provider + +providers: + - name: fake-provider + region-name: fake-region + keypair: 'if-present-use-this-keypair' + username: 'fake' + password: 'fake' + auth-url: 'fake' + project-id: 'fake' + max-servers: 96 + pool: 'fake' + launch-retries: 2 + networks: + - net-id: 'some-uuid' + rate: 0.0001 + images: + - name: fake-image + min-ram: 8192 + name-filter: 'Fake' + meta: + key: value + key2: value + +targets: + - name: fake-target + +diskimages: + - name: fake-image + elements: + - fedora + - vm + release: 21 + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index f2e88e36c..74732890a 100644 --- a/nodepool/tests/test_nodepool.py +++ 
b/nodepool/tests/test_nodepool.py @@ -271,6 +271,27 @@ class TestNodepool(tests.DBTestCase): self.assertEqual('fake-provider', new_nodes[0].provider) self.assertNotEqual(nodes[0], new_nodes[0]) + @mock.patch('nodepool.provider_manager.FakeProviderManager.createServer') + def test_node_launch_retries(self, mock_create_server): + mock_create_server.side_effect = Exception('Boom!') + + configfile = self.setup_config('node_launch_retry.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self._useBuilder(configfile) + pool.start() + self.waitForImage('fake-provider', 'fake-image') + + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) + + req = self.waitForNodeRequest(req) + self.assertEqual(req.state, zk.FAILED) + + # retries in config is set to 2, so 2 attempts to create a server + self.assertEqual(2, mock_create_server.call_count) + @skip("Disabled for early v3 development") def test_node_delete_failure(self): def fail_delete(self, name):