From 7d02b635c34d01b6a546be0b4ff2998c5e2c5117 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Wed, 14 Dec 2016 18:12:32 -0500 Subject: [PATCH 001/309] Add missing pause fields to config-validate Change-Id: If1148acf1572ac7d03b7b1c3e349e8648a096ce7 Signed-off-by: Paul Belanger --- nodepool/cmd/config_validator.py | 2 ++ nodepool/tests/fixtures/config_validate/good.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index 4fd7b7e62..639296b47 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -31,6 +31,7 @@ class ConfigValidator: images = { 'name': str, + 'pause': bool, 'min-ram': int, 'name-filter': str, 'diskimage': str, @@ -104,6 +105,7 @@ class ConfigValidator: diskimages = { 'name': str, + 'pause': bool, 'elements': [str], 'release': v.Any(str, int), 'rebuild-age': int, diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml index 0b8a1ce76..a69d4c0a4 100644 --- a/nodepool/tests/fixtures/config_validate/good.yaml +++ b/nodepool/tests/fixtures/config_validate/good.yaml @@ -70,6 +70,7 @@ providers: rate: 0.001 images: - name: trusty + pause: False min-ram: 8192 username: jenkins private-key: /home/nodepool/.ssh/id_rsa @@ -79,6 +80,7 @@ targets: diskimages: - name: trusty + pause: False elements: - ubuntu - vm From 9f28b4305281affd3e7af23d9bcf12f48f40723b Mon Sep 17 00:00:00 2001 From: Clark Boylan Date: Wed, 14 Dec 2016 15:24:47 -0800 Subject: [PATCH 002/309] Validate configs when used by tests We have somewhat frequently failed to update our voluptuous schema when adding new content to our config because the tests for it are in a corner. Address this by testing that every test's config validates properly when a test applies a config. This means that whenever we add tests for a new feature it needs to have working config validation too. Change-Id: Ie4452747baaf3d89d51da8f252366a6919f4d10a --- nodepool/tests/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 3a3ed5769..d6f8f0ab2 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -37,6 +37,7 @@ import testtools from nodepool import allocation, builder, fakeprovider, nodepool, nodedb, webapp from nodepool import zk +from nodepool.cmd.config_validator import ConfigValidator TRUE_VALUES = ('true', '1', 'yes') @@ -427,6 +428,8 @@ class DBTestCase(BaseTestCase): zookeeper_chroot=self.zookeeper_chroot)) os.close(fd) self._config_images_dir = images_dir + validator = ConfigValidator(path) + validator.validate() return path def replace_config(self, configfile, filename): From ed6050c74cf80f4a1f78a65ea5ec33e4d39dd729 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 16 Dec 2016 07:59:40 -0800 Subject: [PATCH 003/309] Fix image delete exception logging This line had an extra format item which may have caused it to malfunction. 
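For context, a minimal reproduction of the failure mode using only the
standard logging module (the image and provider names here are made up,
not taken from nodepool):

    import logging

    logging.basicConfig()
    log = logging.getLogger("demo")

    try:
        raise RuntimeError("delete failed")
    except Exception:
        # Three placeholders but only two arguments: formatting the record
        # raises TypeError inside the handler, so logging reports an internal
        # formatting error and the intended message is never emitted.
        log.exception("Unable to delete image %s from %s: %s",
                      "ubuntu-trusty-0000000001", "fake-provider")

Dropping the trailing placeholder, as in the hunk below, lets
log.exception format the message and append the traceback normally.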
Change-Id: I55282f51aeb3e3512321a1ef9a60aff066159b56 --- nodepool/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodepool/builder.py b/nodepool/builder.py index 29614b393..6ccb05065 100644 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -311,7 +311,7 @@ class CleanupWorker(BaseWorker): manager.deleteImage(upload.external_name) except Exception: self.log.exception( - "Unable to delete image %s from %s: %s", + "Unable to delete image %s from %s:", upload.external_name, upload.provider_name) else: self._zk.deleteUpload(upload.image_name, upload.build_id, From 70d1b8fd37311e1371b4a80bc5b8d83323caa8d4 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 3 Jan 2017 14:40:31 -0500 Subject: [PATCH 004/309] Register launcher name with ZooKeeper Change-Id: I679590823dd37b09a8962ff934c497d40a9182e0 --- nodepool/nodepool.py | 6 ++++++ nodepool/tests/test_zk.py | 15 +++++++++++++++ nodepool/zk.py | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 875068a00..5fb5bb2ae 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -21,10 +21,12 @@ import apscheduler.triggers.cron import gear import json import logging +import os import os.path import paramiko import pprint import random +import socket import threading import time import zmq @@ -883,6 +885,9 @@ class NodePool(threading.Thread): self._instance_delete_threads = {} self._instance_delete_threads_lock = threading.Lock() self._wake_condition = threading.Condition() + self.launcher_id = "%s-%s-%s" % (socket.gethostname(), + os.getpid(), + self.ident) def stop(self): self._stopped = True @@ -1253,6 +1258,7 @@ class NodePool(threading.Thread): def startup(self): self.updateConfig() + self.zk.registerLauncher(self.launcher_id) # Currently nodepool can not resume building a node or image # after a restart. 
To clean up, mark all building node and diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 6edc63771..f2b317134 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -476,6 +476,21 @@ class TestZooKeeper(tests.DBTestCase): self.zk.deleteUpload("trusty", "000", "rax", "000001") self.assertIsNone(self.zk.client.exists(path)) + def test_registerLauncher(self): + name = "launcher-000-001" + self.zk.registerLauncher(name) + launchers = self.zk.getRegisteredLaunchers() + self.assertEqual(1, len(launchers)) + self.assertEqual(name, launchers[0]) + + def test_registerLauncher_safe_repeat(self): + name = "launcher-000-001" + self.zk.registerLauncher(name) + self.zk.registerLauncher(name) + launchers = self.zk.getRegisteredLaunchers() + self.assertEqual(1, len(launchers)) + self.assertEqual(name, launchers[0]) + class TestZKModel(tests.BaseTestCase): diff --git a/nodepool/zk.py b/nodepool/zk.py index 930367862..65e7cc034 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -299,6 +299,7 @@ class ZooKeeper(object): log = logging.getLogger("nodepool.zk.ZooKeeper") IMAGE_ROOT = "/nodepool/images" + LAUNCHER_ROOT = "/nodepool/launchers" def __init__(self, client=None): ''' @@ -353,6 +354,9 @@ class ZooKeeper(object): return "%s/lock" % self._imageUploadPath(image, build_number, provider) + def _launcherPath(self, launcher): + return "%s/%s" % (self.LAUNCHER_ROOT, launcher) + def _dictToStr(self, data): return json.dumps(data) @@ -1030,3 +1034,33 @@ class ZooKeeper(object): self.client.delete(path) except kze.NoNodeError: pass + + def registerLauncher(self, launcher): + ''' + Register an active node launcher. + + The launcher is automatically de-registered once it terminates or + otherwise disconnects from ZooKeeper. It will need to re-register + after a lost connection. This method is safe to call multiple times. + + :param str launcher: Unique name for the launcher. + ''' + path = self._launcherPath(launcher) + + try: + self.client.create(path, makepath=True, ephemeral=True) + except kze.NodeExistsError: + pass + + def getRegisteredLaunchers(self): + ''' + Get a list of all launchers that have registered with ZooKeeper. + + :returns: A list of launcher names, or empty list if none are found. + ''' + try: + launchers = self.client.get_children(self.LAUNCHER_ROOT) + except kze.NoNodeError: + return [] + + return launchers From 08b720364f30ee4e8ad37822b5eccd3a9b429d74 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Wed, 4 Jan 2017 12:14:31 -0500 Subject: [PATCH 005/309] Add ZK API methods for node requests Adds ZooKeeper API methods to get the current list of outstanding node requests, and to get the data for an individual node request. A new NodeRequest object is introduced to the data model. The model will be expanded on in future reviews. 
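At the ZooKeeper level a node request is simply a znode under
/nodepool/requests whose data is a JSON dict; a rough kazoo-level sketch
of what the two new accessors do (the ZooKeeper host is an assumed local
server, not part of this change):

    import json

    from kazoo.client import KazooClient
    from kazoo.exceptions import NoNodeError

    REQUEST_ROOT = "/nodepool/requests"

    client = KazooClient(hosts="127.0.0.1:2181")  # assumed local ZooKeeper
    client.start()

    # getNodeRequests(): list the children of the request root, sorted so
    # that lower-numbered (higher-priority) requests come first.
    try:
        request_ids = sorted(client.get_children(REQUEST_ROOT))
    except NoNodeError:
        request_ids = []

    # getNodeRequest(): read a single request znode and decode its JSON data.
    for req_id in request_ids:
        try:
            data, stat = client.get("%s/%s" % (REQUEST_ROOT, req_id))
        except NoNodeError:
            continue
        d = json.loads(data)
        print("%s state=%s state_time=%s" %
              (req_id, d.get("state"), d.get("state_time")))

    client.stop()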
Change-Id: I4af96e4e307cc5ce5d3208462e7335c24eece952 --- nodepool/tests/test_zk.py | 63 +++++++++++++++++++++++++++++--- nodepool/zk.py | 77 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 130 insertions(+), 10 deletions(-) diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 30a0ada63..426ebe81b 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -437,27 +437,58 @@ class TestZooKeeper(tests.DBTestCase): self.assertEqual(1, len(launchers)) self.assertEqual(name, launchers[0]) + def test_getNodeRequests_empty(self): + self.assertEqual([], self.zk.getNodeRequests()) + + def test_getNodeRequests(self): + r1 = self.zk._requestPath("500-123") + r2 = self.zk._requestPath("100-456") + r3 = self.zk._requestPath("100-123") + r4 = self.zk._requestPath("400-123") + self.zk.client.create(r1, makepath=True, ephemeral=True) + self.zk.client.create(r2, makepath=True, ephemeral=True) + self.zk.client.create(r3, makepath=True, ephemeral=True) + self.zk.client.create(r4, makepath=True, ephemeral=True) + + self.assertEqual( + ["100-123", "100-456", "400-123", "500-123"], + self.zk.getNodeRequests() + ) + + def test_getNodeRequest(self): + r = zk.NodeRequest("500-123") + r.state = zk.READY + path = self.zk._requestPath(r.id) + self.zk.client.create(path, value=self.zk._dictToStr(r.toDict()), + makepath=True, ephemeral=True) + o = self.zk.getNodeRequest(r.id) + self.assertIsInstance(o, zk.NodeRequest) + self.assertEqual(r.id, o.id) + + def test_getNodeRequest_not_found(self): + self.assertIsNone(self.zk.getNodeRequest("invalid")) + class TestZKModel(tests.BaseTestCase): def setUp(self): super(TestZKModel, self).setUp() - def test_BaseBuilderModel_bad_id(self): + def test_BaseModel_bad_id(self): with testtools.ExpectedException( TypeError, "'id' attribute must be a string type" ): - zk.BaseBuilderModel(123) + zk.BaseModel(123) - def test_BaseBuilderModel_bad_state(self): + def test_BaseModel_bad_state(self): with testtools.ExpectedException( TypeError, "'blah' is not a valid state" ): - o = zk.BaseBuilderModel('0001') + o = zk.BaseModel('0001') o.state = 'blah' - def test_BaseBuilderModel_toDict(self): - o = zk.BaseBuilderModel('0001') + def test_BaseModel_toDict(self): + o = zk.BaseModel('0001') o.state = zk.BUILDING d = o.toDict() self.assertNotIn('id', d) @@ -524,3 +555,23 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(o.state_time, d['state_time']) self.assertEqual(o.external_id, d['external_id']) self.assertEqual(o.external_name, d['external_name']) + + def test_NodeRequest_toDict(self): + o = zk.NodeRequest("500-123") + d = o.toDict() + self.assertNotIn('id', d) + self.assertIn('state', d) + self.assertIn('state_time', d) + + def test_NodeRequest_fromDict(self): + now = int(time.time()) + req_id = "500-123" + d = { + 'state': zk.READY, + 'state_time': now + } + + o = zk.NodeRequest.fromDict(d, req_id) + self.assertEqual(o.id, req_id) + self.assertEqual(o.state, d['state']) + self.assertEqual(o.state_time, d['state_time']) diff --git a/nodepool/zk.py b/nodepool/zk.py index e6c0e2430..749cdd3aa 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -106,7 +106,7 @@ class ZooKeeperWatchEvent(object): self.image = image -class BaseBuilderModel(object): +class BaseModel(object): def __init__(self, o_id): if o_id: self.id = o_id @@ -137,7 +137,7 @@ class BaseBuilderModel(object): def toDict(self): ''' - Convert a BaseBuilderModel object's attributes to a dictionary. + Convert a BaseModel object's attributes to a dictionary. 
''' d = {} d['state'] = self.state @@ -157,7 +157,7 @@ class BaseBuilderModel(object): self.state_time = d['state_time'] -class ImageBuild(BaseBuilderModel): +class ImageBuild(BaseModel): ''' Class representing a DIB image build within the ZooKeeper cluster. ''' @@ -216,7 +216,7 @@ class ImageBuild(BaseBuilderModel): return o -class ImageUpload(BaseBuilderModel): +class ImageUpload(BaseModel): ''' Class representing a provider image upload within the ZooKeeper cluster. ''' @@ -277,6 +277,42 @@ class ImageUpload(BaseBuilderModel): return o +class NodeRequest(BaseModel): + ''' + Class representing a node request. + ''' + + def __init__(self, id=None): + super(NodeRequest, self).__init__(id) + + def __repr__(self): + d = self.toDict() + d['id'] = self.id + d['stat'] = self.stat + return '' % d + + def toDict(self): + ''' + Convert a NodeRequest object's attributes to a dictionary. + ''' + d = super(NodeRequest, self).toDict() + return d + + @staticmethod + def fromDict(d, o_id=None): + ''' + Create a NodeRequest object from a dictionary. + + :param dict d: The dictionary. + :param str o_id: The object ID. + + :returns: An initialized ImageBuild object. + ''' + o = NodeRequest(o_id) + super(NodeRequest, o).fromDict(d) + return o + + class ZooKeeper(object): ''' Class implementing the ZooKeeper interface. @@ -297,6 +333,7 @@ class ZooKeeper(object): IMAGE_ROOT = "/nodepool/images" LAUNCHER_ROOT = "/nodepool/launchers" + REQUEST_ROOT = "/nodepool/requests" def __init__(self): ''' @@ -341,6 +378,9 @@ class ZooKeeper(object): def _launcherPath(self, launcher): return "%s/%s" % (self.LAUNCHER_ROOT, launcher) + def _requestPath(self, request): + return "%s/%s" % (self.REQUEST_ROOT, request) + def _dictToStr(self, data): return json.dumps(data) @@ -1024,3 +1064,32 @@ class ZooKeeper(object): return [] return launchers + + def getNodeRequests(self): + ''' + Get the current list of all node requests in priority sorted order. + + :returns: A list of request nodes. + ''' + try: + requests = self.client.get_children(self.REQUEST_ROOT) + except kze.NoNodeError: + return [] + + return sorted(requests) + + def getNodeRequest(self, request): + ''' + Get the data for a specific node request. + + :returns: The request data, or None if the request was not found. + ''' + path = self._requestPath(request) + try: + data, stat = self.client.get(path) + except kze.NoNodeError: + return None + + d = NodeRequest.fromDict(self._strToDict(data), request) + d.stat = stat + return d From eac6ca73f33d9cbcc22ba4eb269743e15d3980ba Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 5 Jan 2017 12:16:02 -0500 Subject: [PATCH 006/309] Disable tests for nodepoold changes Disables all tests that depend on a working nodepoold and removes obsolete tests since the database and gearman are going away. 
Change-Id: Ic973c8a657fbfa38523e73231bdb5fce53a81f3a --- nodepool/tests/test_builder.py | 4 +++ nodepool/tests/test_commands.py | 4 +++ nodepool/tests/test_nodepool.py | 60 +++++++++++---------------------- nodepool/tests/test_webapp.py | 2 ++ tools/check_devstack_plugin.sh | 2 ++ 5 files changed, 31 insertions(+), 41 deletions(-) diff --git a/nodepool/tests/test_builder.py b/nodepool/tests/test_builder.py index 6da05dc8f..7f7f2ff4b 100644 --- a/nodepool/tests/test_builder.py +++ b/nodepool/tests/test_builder.py @@ -85,6 +85,10 @@ class TestNodepoolBuilderDibImage(tests.BaseTestCase): self.assertRaises(exceptions.BuilderError, image.to_path, '/imagedir/') class TestNodePoolBuilder(tests.DBTestCase): + def setUp(self): + super(tests.DBTestCase, self).setUp() + self.skipTest("Disabled for early v3 development") + def test_start_stop(self): config = self.setup_config('node.yaml') nb = builder.NodePoolBuilder(config) diff --git a/nodepool/tests/test_commands.py b/nodepool/tests/test_commands.py index 387a66a9e..2100d71ae 100644 --- a/nodepool/tests/test_commands.py +++ b/nodepool/tests/test_commands.py @@ -27,6 +27,10 @@ from nodepool import zk class TestNodepoolCMD(tests.DBTestCase): + def setUp(self): + super(tests.DBTestCase, self).setUp() + self.skipTest("Disabled for early v3 development") + def patch_argv(self, *args): argv = ["nodepool", "-s", self.secure_conf] argv.extend(args) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 55bdb60de..40558709e 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -15,13 +15,11 @@ import json import logging -import threading import time from unittest import skip import fixtures -from nodepool import jobs from nodepool import tests from nodepool import nodedb import nodepool.fakeprovider @@ -31,11 +29,7 @@ import nodepool.nodepool class TestNodepool(tests.DBTestCase): log = logging.getLogger("nodepool.TestNodepool") - def test_db(self): - db = nodedb.NodeDatabase(self.dburi) - with db.getSession() as session: - session.getNodes() - + @skip("Disabled for early v3 development") def test_node(self): """Test that an image and node are created""" configfile = self.setup_config('node.yaml') @@ -52,6 +46,7 @@ class TestNodepool(tests.DBTestCase): state=nodedb.READY) self.assertEqual(len(nodes), 1) + @skip("Disabled for early v3 development") def test_disabled_label(self): """Test that an image and node are not created""" configfile = self.setup_config('node_disabled_label.yaml') @@ -68,6 +63,7 @@ class TestNodepool(tests.DBTestCase): state=nodedb.READY) self.assertEqual(len(nodes), 0) + @skip("Disabled for early v3 development") def test_node_net_name(self): """Test that a node is created with a net name""" configfile = self.setup_config('node_net_name.yaml') @@ -84,6 +80,7 @@ class TestNodepool(tests.DBTestCase): state=nodedb.READY) self.assertEqual(len(nodes), 1) + @skip("Disabled for early v3 development") def test_node_vhd_image(self): """Test that a image and node are created vhd image""" configfile = self.setup_config('node_vhd.yaml') @@ -100,6 +97,7 @@ class TestNodepool(tests.DBTestCase): state=nodedb.READY) self.assertEqual(len(nodes), 1) + @skip("Disabled for early v3 development") def test_node_vhd_and_qcow2(self): """Test label provided by vhd and qcow2 images builds""" configfile = self.setup_config('node_vhd_and_qcow2.yaml') @@ -122,6 +120,7 @@ class TestNodepool(tests.DBTestCase): state=nodedb.READY) self.assertEqual(len(nodes), 1) + @skip("Disabled for early v3 development") 
def test_dib_upload_fail(self): """Test that an image upload failure is contained.""" configfile = self.setup_config('node_upload_fail.yaml') @@ -143,6 +142,7 @@ class TestNodepool(tests.DBTestCase): state=nodedb.READY) self.assertEqual(len(nodes), 2) + @skip("Disabled for early v3 development") def test_subnodes(self): """Test that an image and node are created""" configfile = self.setup_config('subnodes.yaml') @@ -168,6 +168,7 @@ class TestNodepool(tests.DBTestCase): for subnode in node.subnodes: self.assertEqual(subnode.state, nodedb.READY) + @skip("Disabled for early v3 development") def test_subnode_deletion_success(self): """Test that subnodes are deleted with parent node""" configfile = self.setup_config('subnodes.yaml') @@ -204,6 +205,7 @@ class TestNodepool(tests.DBTestCase): s = session.getSubNode(subnode_id) self.assertIsNone(s) + @skip("Disabled for early v3 development") def test_node_az(self): """Test that an image and node are created with az specified""" configfile = self.setup_config('node_az.yaml') @@ -221,6 +223,7 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(len(nodes), 1) self.assertEqual(nodes[0].az, 'az1') + @skip("Disabled for early v3 development") def test_node_ipv6(self): """Test that a node is created w/ or w/o ipv6 preferred flag""" configfile = self.setup_config('node_ipv6.yaml') @@ -255,6 +258,7 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(len(nodes), 1) self.assertEqual(nodes[0].ip, 'fake') + @skip("Disabled for early v3 development") def test_node_delete_success(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) @@ -291,6 +295,7 @@ class TestNodepool(tests.DBTestCase): # Make sure our old node was deleted self.assertEqual(len(deleted_nodes), 0) + @skip("Disabled for early v3 development") def test_node_delete_failure(self): def fail_delete(self, name): raise RuntimeError('Fake Error') @@ -334,6 +339,7 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(len(deleted_nodes), 1) self.assertEqual(node_id, deleted_nodes[0].id) + @skip("Disabled for early v3 development") def test_leaked_node(self): """Test that a leaked node is deleted""" configfile = self.setup_config('leaked_node.yaml') @@ -483,6 +489,7 @@ class TestNodepool(tests.DBTestCase): # should be second image built. 
self.assertEqual(images[0].id, 2) + @skip("Disabled for early v3 development") def test_job_start_event(self): """Test that job start marks node used""" configfile = self.setup_config('node.yaml') @@ -507,6 +514,7 @@ class TestNodepool(tests.DBTestCase): state=nodedb.USED) self.assertEqual(len(nodes), 1) + @skip("Disabled for early v3 development") def test_job_end_event(self): """Test that job end marks node delete""" configfile = self.setup_config('node.yaml') @@ -533,6 +541,7 @@ class TestNodepool(tests.DBTestCase): node = session.getNode(1) self.assertEqual(node, None) + @skip("Disabled for early v3 development") def _test_job_auto_hold(self, result): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) @@ -559,6 +568,7 @@ class TestNodepool(tests.DBTestCase): self.wait_for_threads() return pool + @skip("Disabled for early v3 development") def test_job_auto_hold_success(self): """Test that a successful job does not hold a node""" pool = self._test_job_auto_hold('SUCCESS') @@ -566,6 +576,7 @@ class TestNodepool(tests.DBTestCase): node = session.getNode(1) self.assertIsNone(node) + @skip("Disabled for early v3 development") def test_job_auto_hold_failure(self): """Test that a failed job automatically holds a node""" pool = self._test_job_auto_hold('FAILURE') @@ -573,6 +584,7 @@ class TestNodepool(tests.DBTestCase): node = session.getNode(1) self.assertEqual(node.state, nodedb.HOLD) + @skip("Disabled for early v3 development") def test_job_auto_hold_failure_max(self): """Test that a failed job automatically holds only one node""" pool = self._test_job_auto_hold('FAILURE') @@ -600,37 +612,3 @@ class TestNodepool(tests.DBTestCase): with pool.getDB().getSession() as session: node = session.getNode(2) self.assertEqual(node, None) - - -class TestGearClient(tests.DBTestCase): - def test_wait_for_completion(self): - wj = jobs.WatchableJob('test', 'test', 'test') - - def call_on_completed(): - time.sleep(.2) - wj.onCompleted() - - t = threading.Thread(target=call_on_completed) - t.start() - wj.waitForCompletion() - - def test_handle_disconnect(self): - class MyJob(jobs.WatchableJob): - def __init__(self, *args, **kwargs): - super(MyJob, self).__init__(*args, **kwargs) - self.disconnect_called = False - - def onDisconnect(self): - self.disconnect_called = True - super(MyJob, self).onDisconnect() - - client = nodepool.nodepool.GearmanClient() - client.addServer('localhost', self.gearman_server.port) - client.waitForServer() - - job = MyJob('test-job', '', '') - client.submitJob(job) - - self.gearman_server.shutdown() - job.waitForCompletion() - self.assertEqual(job.disconnect_called, True) diff --git a/nodepool/tests/test_webapp.py b/nodepool/tests/test_webapp.py index 9a2671385..586232c66 100644 --- a/nodepool/tests/test_webapp.py +++ b/nodepool/tests/test_webapp.py @@ -16,12 +16,14 @@ import logging import urllib2 +from unittest import skip from nodepool import tests class TestWebApp(tests.DBTestCase): log = logging.getLogger("nodepool.TestWebApp") + @skip("Disabled for early v3 development") def test_image_list(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) diff --git a/tools/check_devstack_plugin.sh b/tools/check_devstack_plugin.sh index 5c8ab135c..ba603daec 100755 --- a/tools/check_devstack_plugin.sh +++ b/tools/check_devstack_plugin.sh @@ -40,6 +40,8 @@ function waitfornode { done } +exit 0 + if [ $NODEPOOL_PAUSE_CENTOS_7_DIB = 'false' ]; then # check that image built waitforimage centos-7 
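The next patch in the series removes Gearman and ZMQ entirely, leaving
ZooKeeper as the only coordination service, and its main loop re-registers
the launcher on every pass. A rough kazoo-level sketch of that ephemeral
registration (the host string and launcher id are illustrative assumptions,
not values from the patches):

    import os
    import socket

    from kazoo.client import KazooClient
    from kazoo.exceptions import NodeExistsError

    LAUNCHER_ROOT = "/nodepool/launchers"

    client = KazooClient(hosts="127.0.0.1:2181")  # assumed local ZooKeeper
    client.start()

    # An ephemeral znode disappears as soon as the ZooKeeper session is
    # lost, so repeating the registration on every loop iteration is safe:
    # after a reconnect the node is simply recreated.
    launcher_id = "%s-%s" % (socket.gethostname(), os.getpid())
    try:
        client.create("%s/%s" % (LAUNCHER_ROOT, launcher_id),
                      ephemeral=True, makepath=True)
    except NodeExistsError:
        pass

    print(sorted(client.get_children(LAUNCHER_ROOT)))
    client.stop()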
From 8ce719b626fce8ec2d03563a860c8caacd29bda7 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 5 Jan 2017 13:12:04 -0500 Subject: [PATCH 007/309] Remove Gearman and ZMQ Sets up a shim for the new nodepool.NodePool.run() method that does not require any service except ZooKeeper and removes all references to Gearman/ZMQ. Change-Id: I452c24d631592f47eb3f4cbffb56f3252f36c298 --- devstack/plugin.sh | 8 - doc/source/configuration.rst | 23 -- doc/source/installation.rst | 48 +-- nodepool/cmd/config_validator.py | 6 - nodepool/config.py | 27 -- nodepool/jobs.py | 78 ---- nodepool/nodepool.py | 334 +++--------------- nodepool/tests/__init__.py | 122 ------- .../tests/fixtures/config_validate/good.yaml | 12 - .../fixtures/config_validate/yaml_error.yaml | 12 - nodepool/tests/fixtures/integration.yaml | 7 - nodepool/tests/fixtures/integration_osc.yaml | 7 - nodepool/tests/fixtures/leaked_node.yaml | 7 - nodepool/tests/fixtures/node.yaml | 7 - nodepool/tests/fixtures/node_az.yaml | 7 - nodepool/tests/fixtures/node_cmd.yaml | 7 - .../tests/fixtures/node_disabled_label.yaml | 7 - .../tests/fixtures/node_diskimage_fail.yaml | 7 - .../tests/fixtures/node_diskimage_pause.yaml | 7 - .../fixtures/node_image_upload_pause.yaml | 7 - nodepool/tests/fixtures/node_ipv6.yaml | 7 - nodepool/tests/fixtures/node_net_name.yaml | 7 - nodepool/tests/fixtures/node_two_image.yaml | 7 - .../tests/fixtures/node_two_image_remove.yaml | 7 - .../tests/fixtures/node_two_provider.yaml | 7 - .../fixtures/node_two_provider_remove.yaml | 7 - nodepool/tests/fixtures/node_upload_fail.yaml | 7 - nodepool/tests/fixtures/node_vhd.yaml | 7 - .../tests/fixtures/node_vhd_and_qcow2.yaml | 7 - nodepool/tests/fixtures/subnodes.yaml | 7 - nodepool/zk.py | 9 +- requirements.txt | 2 - tools/fake-dib.yaml | 6 - tools/fake-servers.py | 93 ----- tools/fake.yaml | 7 - tools/zmq-stream.py | 36 -- 36 files changed, 54 insertions(+), 909 deletions(-) delete mode 100644 nodepool/jobs.py delete mode 100644 tools/fake-servers.py delete mode 100644 tools/zmq-stream.py diff --git a/devstack/plugin.sh b/devstack/plugin.sh index d8456bfad..64c591b6a 100644 --- a/devstack/plugin.sh +++ b/devstack/plugin.sh @@ -184,15 +184,10 @@ zookeeper-servers: - host: localhost port: 2181 -gearman-servers: - - host: localhost - port: 8991 -zmq-publishers: [] # Need to have at least one target for node allocations, but # this does not need to be a jenkins target. targets: - name: dummy - assign-via-gearman: True cron: cleanup: '*/1 * * * *' @@ -419,9 +414,6 @@ function start_nodepool { export PATH=$NODEPOOL_INSTALL/bin:$PATH - # start gearman server - run_process geard "$NODEPOOL_INSTALL/bin/geard -p 8991 -d" - # run a fake statsd so we test stats sending paths export STATSD_HOST=localhost export STATSD_PORT=8125 diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index 98c7b39b7..2fc12d07a 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -112,29 +112,6 @@ and also indicates their default values:: cleanup: '27 */6 * * *' check: '*/15 * * * *' -zmq-publishers --------------- -Lists the ZeroMQ endpoints for the Jenkins masters. Nodepool uses -this to receive real-time notification that jobs are running on nodes -or are complete and nodes may be deleted. Example:: - - zmq-publishers: - - tcp://jenkins1.example.com:8888 - - tcp://jenkins2.example.com:8888 - -gearman-servers ---------------- -Lists the Zuul Gearman servers that should be consulted for real-time -demand. 
Nodepool will use information from these servers to determine -if additional nodes should be created to satisfy current demand. -Example:: - - gearman-servers: - - host: zuul.example.com - port: 4730 - -The ``port`` key is optional (default: 4730). - zookeeper-servers ----------------- Lists the ZooKeeper servers uses for coordinating information between diff --git a/doc/source/installation.rst b/doc/source/installation.rst index 7557b7c34..fc9f25cfc 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -3,51 +3,12 @@ Installation ============ -Nodepool consists of a set of long-running daemons which use an SQL -database, a ZooKeeper cluster, and communicates with Jenkins using -ZeroMQ. +Nodepool consists of a long-running daemon which uses ZooKeeper +for coordination with Zuul. External Requirements --------------------- -Jenkins -~~~~~~~ - -You should have a Jenkins server running with the `ZMQ Event Publisher -`_ -plugin installed (it is available in the Jenkins Update Center). Be -sure that the machine where you plan to run Nodepool can connect to -the ZMQ port specified by the plugin on your Jenkins master(s). - -Zuul -~~~~ - -If you plan to use Nodepool with Zuul (it is optional), you should -ensure that Nodepool can connect to the gearman port on your Zuul -server (TCP 4730 by default). This will allow Nodepool to respond to -current Zuul demand. If you elect not to connect Nodepool to Zuul, it -will still operate in a node-replacement mode. - -Database -~~~~~~~~ - -Nodepool requires an SQL server. MySQL with the InnoDB storage engine -is tested and recommended. PostgreSQL should work fine. Due to the -high number of concurrent connections from Nodepool, SQLite is not -recommended. When adding or deleting nodes, Nodepool will hold open a -database connection for each node. Be sure to configure the database -server to support at least a number of connections equal to twice the -number of nodes you expect to be in use at once. - -All that is necessary is that the database is created. Nodepool will -handle the schema by itself when it is run. - -MySQL Example:: - - CREATE USER 'nodepool'@'localhost' IDENTIFIED BY ''; - CREATE DATABASE nodepooldb; - GRANT ALL ON nodepooldb.* TO 'nodepool'@'localhost'; - ZooKeeper ~~~~~~~~~ @@ -88,11 +49,6 @@ Or install directly from a git checkout with:: pip install . -Note that some distributions provide a libzmq1 which does not support -RCVTIMEO. Removing this libzmq1 from the system libraries will ensure -pip compiles a libzmq1 with appropriate options for the version of -pyzmq used by nodepool. 
- Configuration ------------- diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index 52a591426..1b8b32f9f 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -94,7 +94,6 @@ class ConfigValidator: 'name': str, 'hostname': str, 'subnode-hostname': str, - 'assign-via-gearman': bool, 'jenkins': { 'url': str, 'user': str, @@ -117,11 +116,6 @@ class ConfigValidator: 'elements-dir': str, 'images-dir': str, 'dburi': str, - 'zmq-publishers': [str], - 'gearman-servers': [{ - 'host': str, - 'port': int, - }], 'zookeeper-servers': [{ 'host': str, 'port': int, diff --git a/nodepool/config.py b/nodepool/config.py index 9c2a5de4a..bb482225d 100644 --- a/nodepool/config.py +++ b/nodepool/config.py @@ -101,16 +101,6 @@ class Cron(ConfigValue): return "" % self.name -class ZMQPublisher(ConfigValue): - def __repr__(self): - return "" % self.name - - -class GearmanServer(ConfigValue): - def __repr__(self): - return "" % self.name - - class DiskImage(ConfigValue): def __repr__(self): return "" % self.name @@ -154,8 +144,6 @@ def loadConfig(config_path): newconfig.dburi = None newconfig.provider_managers = {} newconfig.jenkins_managers = {} - newconfig.zmq_publishers = {} - newconfig.gearman_servers = {} newconfig.zookeeper_servers = {} newconfig.diskimages = {} newconfig.crons = {} @@ -170,19 +158,6 @@ def loadConfig(config_path): c.job = None c.timespec = config.get('cron', {}).get(name, default) - for addr in config.get('zmq-publishers', []): - z = ZMQPublisher() - z.name = addr - z.listener = None - newconfig.zmq_publishers[z.name] = z - - for server in config.get('gearman-servers', []): - g = GearmanServer() - g.host = server['host'] - g.port = server.get('port', 4730) - g.name = g.host + '_' + str(g.port) - newconfig.gearman_servers[g.name] = g - for server in config.get('zookeeper-servers', []): z = zk.ZooKeeperConnectionConfig(server['host'], server.get('port', 2181), @@ -312,8 +287,6 @@ def loadConfig(config_path): t.jenkins_apikey = None t.jenkins_credentials_id = None - t.assign_via_gearman = target.get('assign-via-gearman', False) - t.hostname = target.get( 'hostname', '{label.name}-{provider.name}-{node_id}' diff --git a/nodepool/jobs.py b/nodepool/jobs.py deleted file mode 100644 index c1d6f1ab9..000000000 --- a/nodepool/jobs.py +++ /dev/null @@ -1,78 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import logging -import uuid -import threading - -import gear - - -class WatchableJob(gear.Job): - def __init__(self, *args, **kwargs): - super(WatchableJob, self).__init__(*args, **kwargs) - self._completion_handlers = [] - self._event = threading.Event() - - def _handleCompletion(self, mode=None): - self._event.set() - for handler in self._completion_handlers: - handler(self) - - def addCompletionHandler(self, handler): - self._completion_handlers.append(handler) - - def onCompleted(self): - self._handleCompletion() - - def onFailed(self): - self._handleCompletion() - - def onDisconnect(self): - self._handleCompletion() - - def onWorkStatus(self): - pass - - def waitForCompletion(self, timeout=None): - return self._event.wait(timeout) - - -class NodepoolJob(WatchableJob): - def __init__(self, job_name, job_data_obj, nodepool): - job_uuid = str(uuid.uuid4().hex) - job_data = json.dumps(job_data_obj) - super(NodepoolJob, self).__init__(job_name, job_data, job_uuid) - self.nodepool = nodepool - - def getDbSession(self): - return self.nodepool.getDB().getSession() - - -class NodeAssignmentJob(NodepoolJob): - log = logging.getLogger("jobs.NodeAssignmentJob") - - def __init__(self, node_id, target_name, data, nodepool): - self.node_id = node_id - job_name = 'node_assign:%s' % target_name - super(NodeAssignmentJob, self).__init__(job_name, data, nodepool) - - -class NodeRevokeJob(NodepoolJob): - log = logging.getLogger("jobs.NodeRevokeJob") - - def __init__(self, node_id, manager_name, data, nodepool): - self.node_id = node_id - job_name = 'node_revoke:%s' % manager_name - super(NodeRevokeJob, self).__init__(job_name, data, nodepool) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 1157a9b23..f6d4c6129 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -18,7 +18,6 @@ import apscheduler.schedulers.background import apscheduler.triggers.cron -import gear import json import logging import os @@ -29,7 +28,6 @@ import random import socket import threading import time -import zmq import allocation import jenkins_manager @@ -40,7 +38,6 @@ import provider_manager import stats import config as nodepool_config -import jobs import zk MINS = 60 @@ -57,6 +54,8 @@ IMAGE_CLEANUP = 8 * HOURS # When to start deleting an image that is not # READY or is not the current or previous image DELETE_DELAY = 1 * MINS # Delay before deleting a node that has completed # its job. +SUSPEND_WAIT_TIME = 30 # How long to wait between checks for ZooKeeper + # connectivity if it disappears. 
class LaunchNodepoolException(Exception): @@ -172,174 +171,6 @@ class NodeCompleteThread(threading.Thread): self.nodepool.deleteNode(node.id) -class NodeUpdateListener(threading.Thread): - log = logging.getLogger("nodepool.NodeUpdateListener") - - def __init__(self, nodepool, addr): - threading.Thread.__init__(self, name='NodeUpdateListener') - self.nodepool = nodepool - self.socket = self.nodepool.zmq_context.socket(zmq.SUB) - self.socket.RCVTIMEO = 1000 - event_filter = b"" - self.socket.setsockopt(zmq.SUBSCRIBE, event_filter) - self.socket.connect(addr) - self._stopped = False - - def run(self): - while not self._stopped: - try: - m = self.socket.recv().decode('utf-8') - except zmq.error.Again: - continue - try: - topic, data = m.split(None, 1) - self.handleEvent(topic, data) - except Exception: - self.log.exception("Exception handling job:") - - def stop(self): - self._stopped = True - - def handleEvent(self, topic, data): - self.log.debug("Received: %s %s" % (topic, data)) - args = json.loads(data) - build = args['build'] - if 'node_name' not in build: - return - jobname = args['name'] - nodename = args['build']['node_name'] - if topic == 'onStarted': - self.handleStartPhase(nodename, jobname) - elif topic == 'onCompleted': - pass - elif topic == 'onFinalized': - result = args['build'].get('status') - params = args['build'].get('parameters') - if params: - branch = params.get('ZUUL_BRANCH', 'unknown_branch') - else: - branch = 'unknown_branch' - self.handleCompletePhase(nodename, jobname, result, branch) - else: - raise Exception("Received job for unhandled phase: %s" % - topic) - - def handleStartPhase(self, nodename, jobname): - with self.nodepool.getDB().getSession() as session: - node = session.getNodeByNodename(nodename) - if not node: - self.log.debug("Unable to find node with nodename: %s" % - nodename) - return - - target = self.nodepool.config.targets[node.target_name] - if jobname == target.jenkins_test_job: - self.log.debug("Test job for node id: %s started" % node.id) - return - - # Preserve the HOLD state even if a job starts on the node. - if node.state != nodedb.HOLD: - self.log.info("Setting node id: %s to USED" % node.id) - node.state = nodedb.USED - self.nodepool.updateStats(session, node.provider_name) - - def handleCompletePhase(self, nodename, jobname, result, branch): - t = NodeCompleteThread(self.nodepool, nodename, jobname, result, - branch) - t.start() - - -class GearmanClient(gear.Client): - def __init__(self): - super(GearmanClient, self).__init__(client_id='nodepool') - self.__log = logging.getLogger("nodepool.GearmanClient") - - def getNeededWorkers(self): - needed_workers = {} - job_worker_map = {} - unspecified_jobs = {} - for connection in self.active_connections: - try: - req = gear.StatusAdminRequest() - connection.sendAdminRequest(req, timeout=300) - except Exception: - self.__log.exception("Exception while listing functions") - self._lostConnection(connection) - continue - for line in req.response.split('\n'): - parts = [x.strip() for x in line.split('\t')] - # parts[0] - function name - # parts[1] - total jobs queued (including building) - # parts[2] - jobs building - # parts[3] - workers registered - if not parts or parts[0] == '.': - continue - if not parts[0].startswith('build:'): - continue - function = parts[0][len('build:'):] - # total jobs in queue (including building jobs) - # NOTE(jhesketh): Jobs that are being built are accounted for - # in the demand algorithm by subtracting the running nodes. 
- # If there are foreign (to nodepool) workers accepting jobs - # the demand will be higher than actually required. However - # better to have too many than too few and if you have a - # foreign worker this may be desired. - try: - queued = int(parts[1]) - except ValueError as e: - self.__log.warn( - 'Server returned non-integer value in status. (%s)' % - str(e)) - queued = 0 - if queued > 0: - self.__log.debug("Function: %s queued: %s" % (function, - queued)) - if ':' in function: - fparts = function.split(':') - # fparts[0] - function name - # fparts[1] - target node [type] - job = fparts[-2] - worker = fparts[-1] - workers = job_worker_map.get(job, []) - workers.append(worker) - job_worker_map[job] = workers - if queued > 0: - needed_workers[worker] = ( - needed_workers.get(worker, 0) + queued) - elif queued > 0: - job = function - unspecified_jobs[job] = (unspecified_jobs.get(job, 0) + - queued) - for job, queued in unspecified_jobs.items(): - workers = job_worker_map.get(job) - if not workers: - continue - worker = workers[0] - needed_workers[worker] = (needed_workers.get(worker, 0) + - queued) - return needed_workers - - def handleWorkComplete(self, packet): - job = super(GearmanClient, self).handleWorkComplete(packet) - job.onCompleted() - - def handleWorkFail(self, packet): - job = super(GearmanClient, self).handleWorkFail(packet) - job.onFailed() - - def handleWorkException(self, packet): - job = super(GearmanClient, self).handleWorkException(packet) - job.onFailed() - - def handleDisconnect(self, job): - super(GearmanClient, self).handleDisconnect(job) - job.onDisconnect() - - def handleWorkStatus(self, packet): - job = super(GearmanClient, self).handleWorkStatus(packet) - job.onWorkStatus() - - class InstanceDeleter(threading.Thread): log = logging.getLogger("nodepool.InstanceDeleter") @@ -569,10 +400,6 @@ class NodeLauncher(threading.Thread): self.createJenkinsNode() self.log.info("Node id: %s added to jenkins" % self.node.id) - if self.target.assign_via_gearman: - self.log.info("Node id: %s assigning via gearman" % self.node.id) - self.assignViaGearman() - return dt def createJenkinsNode(self): @@ -597,24 +424,6 @@ class NodeLauncher(threading.Thread): params = dict(NODE=self.node.nodename) jenkins.startBuild(self.target.jenkins_test_job, params) - def assignViaGearman(self): - args = dict(name=self.node.nodename, - host=self.node.ip, - description='Dynamic single use %s node' % self.label.name, - labels=self.label.name, - root=self.image.user_home) - job = jobs.NodeAssignmentJob(self.node.id, self.node.target_name, - args, self.nodepool) - self.nodepool.gearman_client.submitJob(job, timeout=300) - job.waitForCompletion() - self.log.info("Node id: %s received %s from assignment" % ( - self.node.id, job.data)) - if job.failure: - raise Exception("Node id: %s received job failure on assignment" % - self.node.id) - data = json.loads(job.data[-1]) - self.node.manager_name = data['manager'] - def writeNodepoolInfo(self, nodelist): key = paramiko.RSAKey.generate(2048) public_key = key.get_name() + ' ' + key.get_base64() @@ -862,6 +671,20 @@ class SubNodeLauncher(threading.Thread): return dt +class RequestWorker(threading.Thread): + log = logging.getLogger("nodepool.RequestWorker") + + def __init__(self, request, zk): + threading.Thread.__init__( + self, name='RequestWorker for %s' % request.id + ) + self.request = request + self.zk = zk + + def run(self): + self.log.info("Handling node request %s" % self.request.id) + + class NodePool(threading.Thread): log = 
logging.getLogger("nodepool.NodePool") @@ -875,8 +698,6 @@ class NodePool(threading.Thread): self.watermark_sleep = watermark_sleep self._stopped = False self.config = None - self.zmq_context = None - self.gearman_client = None self.apsched = None self.zk = None self.statsd = stats.get_client() @@ -895,16 +716,9 @@ class NodePool(threading.Thread): self._wake_condition.notify() self._wake_condition.release() if self.config: - for z in self.config.zmq_publishers.values(): - z.listener.stop() - z.listener.join() provider_manager.ProviderManager.stopProviders(self.config) - if self.zmq_context: - self.zmq_context.destroy() if self.apsched and self.apsched.running: self.apsched.shutdown() - if self.gearman_client: - self.gearman_client.shutdown() self.log.debug("finished stopping") def loadConfig(self): @@ -913,12 +727,6 @@ class NodePool(threading.Thread): nodepool_config.loadSecureConfig(config, self.securefile) return config - def reconfigureDatabase(self, config): - if (not self.config) or config.dburi != self.config.dburi: - config.db = nodedb.NodeDatabase(config.dburi) - else: - config.db = self.config.db - def reconfigureManagers(self, config, check_targets=True): provider_manager.ProviderManager.reconfigure(self.config, config) @@ -989,54 +797,6 @@ class NodePool(threading.Thread): else: c.job = self.config.crons[c.name].job - def reconfigureUpdateListeners(self, config): - if self.no_deletes: - return - if self.config: - running = set(self.config.zmq_publishers.keys()) - else: - running = set() - - configured = set(config.zmq_publishers.keys()) - if running == configured: - self.log.debug("ZMQ Listeners do not need to be updated") - if self.config: - config.zmq_publishers = self.config.zmq_publishers - return - - if self.zmq_context: - self.log.debug("Stopping listeners") - self.zmq_context.destroy() - self.zmq_context = zmq.Context() - for z in config.zmq_publishers.values(): - self.log.debug("Starting listener for %s" % z.name) - z.listener = NodeUpdateListener(self, z.name) - z.listener.start() - - def reconfigureGearmanClient(self, config): - if self.config: - running = set(self.config.gearman_servers.keys()) - else: - running = set() - - configured = set(config.gearman_servers.keys()) - if running == configured: - self.log.debug("Gearman client does not need to be updated") - if self.config: - config.gearman_servers = self.config.gearman_servers - return - - if self.gearman_client: - self.log.debug("Stopping gearman client") - self.gearman_client.shutdown() - self.gearman_client = None - if configured: - self.gearman_client = GearmanClient() - for g in config.gearman_servers.values(): - self.log.debug("Adding gearman server %s" % g.name) - self.gearman_client.addServer(g.host, g.port) - self.gearman_client.waitForServer() - def reconfigureZooKeeper(self, config): if self.config: running = self.config.zookeeper_servers.values() @@ -1078,10 +838,7 @@ class NodePool(threading.Thread): def getNeededNodes(self, session, allocation_history): self.log.debug("Beginning node launch calculation") # Get the current demand for nodes. 
- if self.gearman_client: - label_demand = self.gearman_client.getNeededWorkers() - else: - label_demand = {} + label_demand = {} for name, demand in label_demand.items(): self.log.debug(" Demand from gearman: %s: %s" % (name, demand)) @@ -1245,45 +1002,49 @@ class NodePool(threading.Thread): def updateConfig(self): config = self.loadConfig() - self.reconfigureDatabase(config) self.reconfigureZooKeeper(config) self.reconfigureManagers(config) - self.reconfigureUpdateListeners(config) - self.reconfigureGearmanClient(config) self.reconfigureCrons(config) self.setConfig(config) - def startup(self): - self.updateConfig() - self.zk.registerLauncher(self.launcher_id) - - # Currently nodepool can not resume building a node or image - # after a restart. To clean up, mark all building node and - # images for deletion when the daemon starts. - with self.getDB().getSession() as session: - for node in session.getNodes(state=nodedb.BUILDING): - self.log.info("Setting building node id: %s to delete " - "on startup" % node.id) - node.state = nodedb.DELETE - def run(self): - try: - self.startup() - except Exception: - self.log.exception("Exception in startup:") + ''' + Start point for the NodePool thread. + ''' allocation_history = allocation.AllocationHistory() + while not self._stopped: try: self.updateConfig() - with self.getDB().getSession() as session: - self._run(session, allocation_history) + + # Don't do work if we've lost communication with the ZK cluster + while self.zk and (self.zk.suspended or self.zk.lost): + self.log.info("ZooKeeper suspended. Waiting") + time.sleep(SUSPEND_WAIT_TIME) + + # Make sure we're always registered with ZK + self.zk.registerLauncher(self.launcher_id) + self._run(allocation_history) except Exception: self.log.exception("Exception in main loop:") + self._wake_condition.acquire() self._wake_condition.wait(self.watermark_sleep) self._wake_condition.release() - def _run(self, session, allocation_history): + def _run(self, allocation_history): + if self.no_launches: + return + + for req_id in self.zk.getNodeRequests(): + request = self.zk.getNodeRequest(req_id) + if request.state != zk.REQUESTED: + continue + + worker = RequestWorker(request, self.zk) + worker.start() + + def _run_OLD(self, session, allocation_history): if self.no_launches: return # Make up the subnode deficit first to make sure that an @@ -1380,13 +1141,6 @@ class NodePool(threading.Thread): finally: self._delete_threads_lock.release() - def revokeAssignedNode(self, node): - args = dict(name=node.nodename) - job = jobs.NodeRevokeJob(node.id, node.manager_name, - args, self) - self.gearman_client.submitJob(job, timeout=300) - # Do not wait for completion in case the manager is offline - def _deleteNode(self, session, node): self.log.debug("Deleting node id: %s which has been in %s " "state for %s hours" % diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 122eb8fd9..f93cd5d98 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -15,13 +15,11 @@ """Common utilities used in testing""" -import errno import glob import logging import os import pymysql import random -import re import string import subprocess import threading @@ -30,7 +28,6 @@ import time import uuid import fixtures -import gear import lockfile import kazoo.client import testtools @@ -46,74 +43,6 @@ class LoggingPopen(subprocess.Popen): pass -class FakeGearmanServer(gear.Server): - def __init__(self, port=0): - self.hold_jobs_in_queue = False - super(FakeGearmanServer, self).__init__(port) - - def 
getJobForConnection(self, connection, peek=False): - for queue in [self.high_queue, self.normal_queue, self.low_queue]: - for job in queue: - if not hasattr(job, 'waiting'): - if job.name.startswith('build:'): - job.waiting = self.hold_jobs_in_queue - else: - job.waiting = False - if job.waiting: - continue - if job.name in connection.functions: - if not peek: - queue.remove(job) - connection.related_jobs[job.handle] = job - job.worker_connection = connection - job.running = True - return job - return None - - def release(self, regex=None): - released = False - qlen = (len(self.high_queue) + len(self.normal_queue) + - len(self.low_queue)) - self.log.debug("releasing queued job %s (%s)" % (regex, qlen)) - for job in self.getQueue(): - cmd, name = job.name.split(':') - if cmd != 'build': - continue - if not regex or re.match(regex, name): - self.log.debug("releasing queued job %s" % - job.unique) - job.waiting = False - released = True - else: - self.log.debug("not releasing queued job %s" % - job.unique) - if released: - self.wakeConnections() - qlen = (len(self.high_queue) + len(self.normal_queue) + - len(self.low_queue)) - self.log.debug("done releasing queued jobs %s (%s)" % (regex, qlen)) - - -class GearmanServerFixture(fixtures.Fixture): - def __init__(self, port=0): - self._port = port - - def setUp(self): - super(GearmanServerFixture, self).setUp() - self.gearman_server = FakeGearmanServer(self._port) - self.addCleanup(self.shutdownGearman) - - def shutdownGearman(self): - #TODO:greghaynes remove try once gear client protects against this - try: - self.gearman_server.shutdown() - except OSError as e: - if e.errno == errno.EBADF: - pass - else: - raise - - class ZookeeperServerFixture(fixtures.Fixture): def _setUp(self): zk_host = os.environ.get('NODEPOOL_ZK_HOST', 'localhost') @@ -171,37 +100,6 @@ class ChrootedKazooFixture(fixtures.Fixture): _tmp_client.close() -class GearmanClient(gear.Client): - def __init__(self): - super(GearmanClient, self).__init__(client_id='test_client') - self.__log = logging.getLogger("tests.GearmanClient") - - def get_queued_image_jobs(self): - 'Count the number of image-build and upload jobs queued.' 
- queued = 0 - for connection in self.active_connections: - try: - req = gear.StatusAdminRequest() - connection.sendAdminRequest(req) - except Exception: - self.__log.exception("Exception while listing functions") - self._lostConnection(connection) - continue - for line in req.response.split('\n'): - parts = [x.strip() for x in line.split('\t')] - # parts[0] - function name - # parts[1] - total jobs queued (including building) - # parts[2] - jobs building - # parts[3] - workers registered - if not parts or parts[0] == '.': - continue - if (not parts[0].startswith('image-build:') and - not parts[0].startswith('image-upload:')): - continue - queued += int(parts[1]) - return queued - - class BaseTestCase(testtools.TestCase): def setUp(self): super(BaseTestCase, self).setUp() @@ -265,8 +163,6 @@ class BaseTestCase(testtools.TestCase): 'NodePool', 'NodePool Builder', 'NodeUpdateListener', - 'Gearman client connect', - 'Gearman client poll', 'fake-provider', 'fake-provider1', 'fake-provider2', @@ -397,11 +293,6 @@ class DBTestCase(BaseTestCase): self.useFixture(f) self.dburi = f.dburi self.secure_conf = self._setup_secure() - - gearman_fixture = GearmanServerFixture() - self.useFixture(gearman_fixture) - self.gearman_server = gearman_fixture.gearman_server - self.setupZK() def setup_config(self, filename, images_dir=None): @@ -414,7 +305,6 @@ class DBTestCase(BaseTestCase): with open(configfile) as conf_fd: config = conf_fd.read() os.write(fd, config.format(images_dir=images_dir.path, - gearman_port=self.gearman_server.port, zookeeper_host=self.zookeeper_host, zookeeper_port=self.zookeeper_port, zookeeper_chroot=self.zookeeper_chroot)) @@ -540,18 +430,6 @@ class DBTestCase(BaseTestCase): time.sleep(1) self.wait_for_threads() - def waitForJobs(self): - # XXX:greghaynes - There is a very narrow race here where nodepool - # is who actually updates the database so this may return before the - # image rows are updated. 
- client = GearmanClient() - client.addServer('localhost', self.gearman_server.port) - client.waitForServer() - - while client.get_queued_image_jobs() > 0: - time.sleep(.2) - client.shutdown() - def useNodepool(self, *args, **kwargs): args = (self.secure_conf,) + args pool = nodepool.NodePool(*args, **kwargs) diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml index b7b12e7d0..623a2f5ca 100644 --- a/nodepool/tests/fixtures/config_validate/good.yaml +++ b/nodepool/tests/fixtures/config_validate/good.yaml @@ -5,18 +5,6 @@ cron: cleanup: '*/1 * * * *' check: '*/15 * * * *' -zmq-publishers: - - tcp://jenkins01.openstack.org:8888 - - tcp://jenkins02.openstack.org:8888 - - tcp://jenkins03.openstack.org:8888 - - tcp://jenkins04.openstack.org:8888 - - tcp://jenkins05.openstack.org:8888 - - tcp://jenkins06.openstack.org:8888 - - tcp://jenkins07.openstack.org:8888 - -gearman-servers: - - host: zuul.openstack.org - zookeeper-servers: - host: zk1.openstack.org port: 2181 diff --git a/nodepool/tests/fixtures/config_validate/yaml_error.yaml b/nodepool/tests/fixtures/config_validate/yaml_error.yaml index 2b8a0bf70..08dd9626c 100644 --- a/nodepool/tests/fixtures/config_validate/yaml_error.yaml +++ b/nodepool/tests/fixtures/config_validate/yaml_error.yaml @@ -5,18 +5,6 @@ cron: cleanup: '*/1 * * * *' check: '*/15 * * * *' -zmq-publishers: - - tcp://jenkins01.openstack.org:8888 - - tcp://jenkins02.openstack.org:8888 - - tcp://jenkins03.openstack.org:8888 - - tcp://jenkins04.openstack.org:8888 - - tcp://jenkins05.openstack.org:8888 - - tcp://jenkins06.openstack.org:8888 - - tcp://jenkins07.openstack.org:8888 - -gearman-servers: - - host: zuul.openstack.org - zookeeper-servers: - host: zk1.openstack.org port: 2181 diff --git a/nodepool/tests/fixtures/integration.yaml b/nodepool/tests/fixtures/integration.yaml index 0a6e4c926..400695a8e 100644 --- a/nodepool/tests/fixtures/integration.yaml +++ b/nodepool/tests/fixtures/integration.yaml @@ -4,13 +4,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: localhost diff --git a/nodepool/tests/fixtures/integration_osc.yaml b/nodepool/tests/fixtures/integration_osc.yaml index 3222a960d..ac0c955f6 100644 --- a/nodepool/tests/fixtures/integration_osc.yaml +++ b/nodepool/tests/fixtures/integration_osc.yaml @@ -4,13 +4,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: localhost diff --git a/nodepool/tests/fixtures/leaked_node.yaml b/nodepool/tests/fixtures/leaked_node.yaml index e84cf2d99..d54ae71d0 100644 --- a/nodepool/tests/fixtures/leaked_node.yaml +++ b/nodepool/tests/fixtures/leaked_node.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '* * * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node.yaml b/nodepool/tests/fixtures/node.yaml index 6636d3a2d..7e62d53c7 100644 --- a/nodepool/tests/fixtures/node.yaml +++ b/nodepool/tests/fixtures/node.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: 
{zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_az.yaml b/nodepool/tests/fixtures/node_az.yaml index b78e67d08..cf1dd2cb4 100644 --- a/nodepool/tests/fixtures/node_az.yaml +++ b/nodepool/tests/fixtures/node_az.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_cmd.yaml b/nodepool/tests/fixtures/node_cmd.yaml index 3c18b82bc..bffba1c39 100644 --- a/nodepool/tests/fixtures/node_cmd.yaml +++ b/nodepool/tests/fixtures/node_cmd.yaml @@ -4,13 +4,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_disabled_label.yaml b/nodepool/tests/fixtures/node_disabled_label.yaml index 6bdc3040a..646ed14b9 100644 --- a/nodepool/tests/fixtures/node_disabled_label.yaml +++ b/nodepool/tests/fixtures/node_disabled_label.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_diskimage_fail.yaml b/nodepool/tests/fixtures/node_diskimage_fail.yaml index 0787d62c8..08c956198 100644 --- a/nodepool/tests/fixtures/node_diskimage_fail.yaml +++ b/nodepool/tests/fixtures/node_diskimage_fail.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_diskimage_pause.yaml b/nodepool/tests/fixtures/node_diskimage_pause.yaml index a8a8e1861..87dbd5735 100644 --- a/nodepool/tests/fixtures/node_diskimage_pause.yaml +++ b/nodepool/tests/fixtures/node_diskimage_pause.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_image_upload_pause.yaml b/nodepool/tests/fixtures/node_image_upload_pause.yaml index 0232e852d..e22481ff5 100644 --- a/nodepool/tests/fixtures/node_image_upload_pause.yaml +++ b/nodepool/tests/fixtures/node_image_upload_pause.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_ipv6.yaml b/nodepool/tests/fixtures/node_ipv6.yaml index 2cd1c6e8c..dc4ecd6ef 100644 --- a/nodepool/tests/fixtures/node_ipv6.yaml +++ b/nodepool/tests/fixtures/node_ipv6.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_net_name.yaml b/nodepool/tests/fixtures/node_net_name.yaml index 8af31f150..61008c4d5 
100644 --- a/nodepool/tests/fixtures/node_net_name.yaml +++ b/nodepool/tests/fixtures/node_net_name.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_two_image.yaml b/nodepool/tests/fixtures/node_two_image.yaml index f55d8ffc4..82c6872d9 100644 --- a/nodepool/tests/fixtures/node_two_image.yaml +++ b/nodepool/tests/fixtures/node_two_image.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_two_image_remove.yaml b/nodepool/tests/fixtures/node_two_image_remove.yaml index 6636d3a2d..7e62d53c7 100644 --- a/nodepool/tests/fixtures/node_two_image_remove.yaml +++ b/nodepool/tests/fixtures/node_two_image_remove.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_two_provider.yaml b/nodepool/tests/fixtures/node_two_provider.yaml index 849c360a5..8a7fbb135 100644 --- a/nodepool/tests/fixtures/node_two_provider.yaml +++ b/nodepool/tests/fixtures/node_two_provider.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_two_provider_remove.yaml b/nodepool/tests/fixtures/node_two_provider_remove.yaml index 85310516e..d48d8d550 100644 --- a/nodepool/tests/fixtures/node_two_provider_remove.yaml +++ b/nodepool/tests/fixtures/node_two_provider_remove.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_upload_fail.yaml b/nodepool/tests/fixtures/node_upload_fail.yaml index 799f84d0b..ba3d961e3 100644 --- a/nodepool/tests/fixtures/node_upload_fail.yaml +++ b/nodepool/tests/fixtures/node_upload_fail.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_vhd.yaml b/nodepool/tests/fixtures/node_vhd.yaml index 12bfd34b8..3676f3516 100644 --- a/nodepool/tests/fixtures/node_vhd.yaml +++ b/nodepool/tests/fixtures/node_vhd.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/node_vhd_and_qcow2.yaml b/nodepool/tests/fixtures/node_vhd_and_qcow2.yaml index b2b28e15f..a1e9a569f 100644 --- a/nodepool/tests/fixtures/node_vhd_and_qcow2.yaml +++ b/nodepool/tests/fixtures/node_vhd_and_qcow2.yaml @@ -5,13 +5,6 @@ 
cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/fixtures/subnodes.yaml b/nodepool/tests/fixtures/subnodes.yaml index d076178c8..53c9ff8b3 100644 --- a/nodepool/tests/fixtures/subnodes.yaml +++ b/nodepool/tests/fixtures/subnodes.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/zk.py b/nodepool/zk.py index 749cdd3aa..d59d2129a 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -35,8 +35,15 @@ READY = 'ready' DELETING = 'deleting' # The build failed. FAILED = 'failed' +# Node request is submitted/unhandled. +REQUESTED = 'requested' +# Node request has been processed successfully. +FULFILLED = 'fulfilled' +# Node request is being worked. +PENDING = 'pending' -STATES = set([BUILDING, UPLOADING, READY, DELETING, FAILED]) +STATES = set([BUILDING, UPLOADING, READY, DELETING, FAILED, + REQUESTED, FULFILLED, PENDING]) class ZooKeeperConnectionConfig(object): ''' diff --git a/requirements.txt b/requirements.txt index d5658afcd..324e4e45f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ pbr>=1.3 -gear PyYAML python-jenkins paramiko>1.11.6,<2.0.0 @@ -9,7 +8,6 @@ extras statsd>=3.0 apscheduler>=3.0 sqlalchemy>=0.8.2,<1.1.0 -pyzmq>=13.1.0 PyMySQL PrettyTable>=0.6,<0.8 # shade has a looser requirement on six than nodepool, so install six first diff --git a/tools/fake-dib.yaml b/tools/fake-dib.yaml index 120994f9e..bb02b7108 100644 --- a/tools/fake-dib.yaml +++ b/tools/fake-dib.yaml @@ -5,12 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - diskimages: - name: fake-dib-image elements: diff --git a/tools/fake-servers.py b/tools/fake-servers.py deleted file mode 100644 index 2b93e85b8..000000000 --- a/tools/fake-servers.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2013 Hewlett-Packard Development Company, L.P. -# Copyright 2011-2013 OpenStack Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -# A test script to stand in for a zeromq enabled jenkins. It sends zmq -# events that simulate the jenkins node lifecycle. 
-# -# Usage: -# zmq-server.py start HOSTNAME -# zmq-server.py complete HOSTNAME - -import gear -import json -import logging -import select -import socket -import threading -import zmq - -class MyGearmanServer(gear.Server): - def handleStatus(self, request): - request.connection.conn.send(("build:fake_job\t%s\t0\t0\n" % - self._count).encode('utf8')) - request.connection.conn.send(("build:fake_job:devstack-precise\t%s\t0\t0\n" % - 0).encode('utf8')) - request.connection.conn.send(b'.\n') - -class FakeStatsd(object): - def __init__(self): - self.thread = threading.Thread(target=self.run) - self.thread.daemon = True - self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - self.sock.bind(('', 8125)) - self.stats = [] - self.thread.start() - - def run(self): - while True: - poll = select.poll() - poll.register(self.sock, select.POLLIN) - ret = poll.poll() - for (fd, event) in ret: - if fd == self.sock.fileno(): - data = self.sock.recvfrom(1024) - if not data: - return - print data[0] - self.stats.append(data[0]) - -def main(): - logging.basicConfig(level=logging.DEBUG) - context = zmq.Context() - zsocket = context.socket(zmq.PUB) - zsocket.bind("tcp://*:8881") - - geard = MyGearmanServer(statsd_host='localhost', statsd_port=8125, - statsd_prefix='zuul.geard') - geard._count = 0 - - statsd = FakeStatsd() - - print('ready') - while True: - line = raw_input() - command, arg = line.split() - if command == 'queue': - geard._count = int(arg) - elif command == 'start': - topic = 'onStarted' - data = {"name":"test","url":"job/test/","build":{"full_url":"http://localhost:8080/job/test/1/","number":1,"phase":"STARTED","url":"job/test/1/","node_name":arg}} - zsocket.send("%s %s" % (topic, json.dumps(data))) - elif command == 'complete': - topic = 'onFinalized' - data = {"name":"test","url":"job/test/","build":{"full_url":"http://localhost:8080/job/test/1/","number":1,"phase":"FINISHED","status":"SUCCESS","url":"job/test/1/","node_name":arg, "parameters":{"BASE_LOG_PATH":"05/60105/3/gate","LOG_PATH":"05/60105/3/gate/gate-tempest-dsvm-postgres-full/bf0f215","OFFLINE_NODE_WHEN_COMPLETE":"1","ZUUL_BRANCH":"master","ZUUL_CHANGE":"60105","ZUUL_CHANGE_IDS":"60105,3","ZUUL_CHANGES":"openstack/cinder:master:refs/changes/05/60105/3","ZUUL_COMMIT":"ccd02fce4148d5ac2b3e1e68532b55eb5c1c356d","ZUUL_PATCHSET":"3","ZUUL_PIPELINE":"gate","ZUUL_PROJECT":"openstack/cinder","ZUUL_REF":"refs/zuul/master/Z6726d84e57a04ec79585b895ace08f7e","ZUUL_URL":"http://zuul.openstack.org/p","ZUUL_UUID":"bf0f21577026492a985ca98a9ea14cc1"}}} - zsocket.send("%s %s" % (topic, json.dumps(data))) - -if __name__ == '__main__': - main() diff --git a/tools/fake.yaml b/tools/fake.yaml index 0df2ddae1..faf6e87c5 100644 --- a/tools/fake.yaml +++ b/tools/fake.yaml @@ -4,12 +4,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - zookeeper-servers: - host: localhost @@ -56,4 +50,3 @@ providers: targets: - name: zuul - assign-via-gearman: True diff --git a/tools/zmq-stream.py b/tools/zmq-stream.py deleted file mode 100644 index aa0227547..000000000 --- a/tools/zmq-stream.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2012 Hewlett-Packard Development Company, L.P. -# Copyright 2013 OpenStack Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -# A test script to watch a zmq stream -# -# Usage: -# zmq-stream.py - -import zmq - -context = zmq.Context() -socket = context.socket(zmq.SUB) -event_filter = b"" -socket.setsockopt(zmq.SUBSCRIBE, event_filter) -socket.connect("tcp://localhost:8888") - -print('ready') -while True: - m = socket.recv().decode('utf-8') - print(m) From 7f279b4b6972f0fe4b708eec0617b7321c98f3e2 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 5 Jan 2017 16:35:20 -0500 Subject: [PATCH 008/309] Minor code cleanup Just cleaning up some things that keep annoying me: not properly closing kazoo client connections in the test, and not using a state variable in one place. Change-Id: I0a99e9d81b73af40f91950e9d31e2ef64c5bb3cb --- nodepool/zk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodepool/zk.py b/nodepool/zk.py index d59d2129a..6f9ffeeb9 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -881,7 +881,7 @@ class ZooKeeper(object): return uploads[:count] def getMostRecentImageUpload(self, image, provider, - state="ready"): + state=READY): ''' Retrieve the most recent image upload data with the given state. From 867480bb4824633f00fe770bd29c17ed4ef33567 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 6 Jan 2017 13:48:42 -0500 Subject: [PATCH 009/309] Set valid states per data model class Some states are not valid within a certain context. For example, an ImageBuild cannot be in the UPLOADING state, nor can an ImageUpload be in the BUILDING state. Set the list of valid states in each data model class. 
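For example (an illustrative sketch only; the object IDs are made up, and the
TypeError comes from the existing BaseModel state setter):

    from nodepool import zk

    build = zk.ImageBuild('0001')
    build.state = zk.BUILDING     # ok: BUILDING is in ImageBuild.VALID_STATES

    upload = zk.ImageUpload('0001', '0003')
    upload.state = zk.UPLOADING   # ok: UPLOADING is in ImageUpload.VALID_STATES
    upload.state = zk.BUILDING    # raises TypeError: a build-only state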
Change-Id: I5efc3809042d6dc850fe07f0ae0362d1b9870d4c --- nodepool/tests/test_zk.py | 13 ++++++++----- nodepool/zk.py | 9 ++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 426ebe81b..5cc27fbce 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -457,7 +457,7 @@ class TestZooKeeper(tests.DBTestCase): def test_getNodeRequest(self): r = zk.NodeRequest("500-123") - r.state = zk.READY + r.state = zk.REQUESTED path = self.zk._requestPath(r.id) self.zk.client.create(path, value=self.zk._dictToStr(r.toDict()), makepath=True, ephemeral=True) @@ -489,19 +489,19 @@ class TestZKModel(tests.BaseTestCase): def test_BaseModel_toDict(self): o = zk.BaseModel('0001') - o.state = zk.BUILDING d = o.toDict() self.assertNotIn('id', d) - self.assertEqual(o.state, d['state']) - self.assertIsNotNone(d['state_time']) def test_ImageBuild_toDict(self): o = zk.ImageBuild('0001') + o.state = zk.BUILDING o.builder = 'localhost' o.formats = ['qemu', 'raw'] d = o.toDict() self.assertNotIn('id', d) + self.assertEqual(o.state, d['state']) + self.assertIsNotNone(d['state_time']) self.assertEqual(','.join(o.formats), d['formats']) self.assertEqual(o.builder, d['builder']) @@ -524,6 +524,7 @@ class TestZKModel(tests.BaseTestCase): def test_ImageUpload_toDict(self): o = zk.ImageUpload('0001', '0003') + o.state = zk.UPLOADING o.external_id = 'DEADBEEF' o.external_name = 'trusty' @@ -532,6 +533,8 @@ class TestZKModel(tests.BaseTestCase): self.assertNotIn('build_id', d) self.assertNotIn('provider_name', d) self.assertNotIn('image_name', d) + self.assertEqual(o.state, d['state']) + self.assertEqual(o.state_time, d['state_time']) self.assertEqual(o.external_id, d['external_id']) self.assertEqual(o.external_name, d['external_name']) @@ -567,7 +570,7 @@ class TestZKModel(tests.BaseTestCase): now = int(time.time()) req_id = "500-123" d = { - 'state': zk.READY, + 'state': zk.REQUESTED, 'state_time': now } diff --git a/nodepool/zk.py b/nodepool/zk.py index 6f9ffeeb9..ca34fdef8 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -42,8 +42,6 @@ FULFILLED = 'fulfilled' # Node request is being worked. PENDING = 'pending' -STATES = set([BUILDING, UPLOADING, READY, DELETING, FAILED, - REQUESTED, FULFILLED, PENDING]) class ZooKeeperConnectionConfig(object): ''' @@ -114,6 +112,8 @@ class ZooKeeperWatchEvent(object): class BaseModel(object): + VALID_STATES = set([]) + def __init__(self, o_id): if o_id: self.id = o_id @@ -137,7 +137,7 @@ class BaseModel(object): @state.setter def state(self, value): - if value not in STATES: + if value not in self.VALID_STATES: raise TypeError("'%s' is not a valid state" % value) self._state = value self.state_time = time.time() @@ -168,6 +168,7 @@ class ImageBuild(BaseModel): ''' Class representing a DIB image build within the ZooKeeper cluster. ''' + VALID_STATES = set([BUILDING, READY, DELETING, FAILED]) def __init__(self, build_id=None): super(ImageBuild, self).__init__(build_id) @@ -227,6 +228,7 @@ class ImageUpload(BaseModel): ''' Class representing a provider image upload within the ZooKeeper cluster. ''' + VALID_STATES = set([UPLOADING, READY, DELETING, FAILED]) def __init__(self, build_id=None, provider_name=None, image_name=None, upload_id=None): @@ -288,6 +290,7 @@ class NodeRequest(BaseModel): ''' Class representing a node request. 
''' + VALID_STATES = set([REQUESTED, PENDING, FULFILLED, FAILED]) def __init__(self, id=None): super(NodeRequest, self).__init__(id) From 4a7ab0e028b66342324dd5dd5a4f4cb1e8b949b4 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 9 Jan 2017 12:48:54 -0500 Subject: [PATCH 010/309] Create per-provider ZK work threads Each provider receives its own thread for polling the ZK node request queue. Whenever a new provider is added to the config, a new thread will be created for it. Change-Id: I7fceec3b649011544fd2fbec961015bac35c9f21 --- nodepool/nodepool.py | 57 ++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index f6d4c6129..6d1d41bd1 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -671,18 +671,26 @@ class SubNodeLauncher(threading.Thread): return dt -class RequestWorker(threading.Thread): - log = logging.getLogger("nodepool.RequestWorker") - - def __init__(self, request, zk): +class ProviderWorker(threading.Thread): + def __init__(self, zk, provider): threading.Thread.__init__( - self, name='RequestWorker for %s' % request.id + self, name='ProviderWorker.%s' % provider.name ) - self.request = request + self.log = logging.getLogger("nodepool.%s" % self.name) + self.provider = provider self.zk = zk + self.running = False def run(self): - self.log.info("Handling node request %s" % self.request.id) + self.running = True + + while self.running: + self.log.debug("Getting job from ZK queue") + time.sleep(10) + + def stop(self): + self.log.info("%s received stop" % self.name) + self.running = False class NodePool(threading.Thread): @@ -1011,7 +1019,12 @@ class NodePool(threading.Thread): ''' Start point for the NodePool thread. ''' - allocation_history = allocation.AllocationHistory() + + if self.no_launches: + return + + # Provider threads keyed by provider name + provider_threads = {} while not self._stopped: try: @@ -1024,7 +1037,14 @@ class NodePool(threading.Thread): # Make sure we're always registered with ZK self.zk.registerLauncher(self.launcher_id) - self._run(allocation_history) + + # Start provider threads for each provider in the config + for p in self.config.providers.values(): + if p.name not in provider_threads.keys(): + t = ProviderWorker(self.zk, p) + self.log.info( "Starting %s" % t.name) + t.start() + provider_threads[p.name] = t except Exception: self.log.exception("Exception in main loop:") @@ -1032,19 +1052,14 @@ class NodePool(threading.Thread): self._wake_condition.wait(self.watermark_sleep) self._wake_condition.release() - def _run(self, allocation_history): - if self.no_launches: - return + # Stop provider threads + for thd in provider_threads.values(): + if thd.isAlive(): + thd.stop() + self.log.info("Waiting for %s" % thd.name) + thd.join() - for req_id in self.zk.getNodeRequests(): - request = self.zk.getNodeRequest(req_id) - if request.state != zk.REQUESTED: - continue - - worker = RequestWorker(request, self.zk) - worker.start() - - def _run_OLD(self, session, allocation_history): + def _run(self, session, allocation_history): if self.no_launches: return # Make up the subnode deficit first to make sure that an From 096ed90d45438a534b952bd771b79272a32bee8f Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 9 Jan 2017 13:09:10 -0500 Subject: [PATCH 011/309] Remove --no-launches nodepoold option This isn't needed in v3. 
Change-Id: I4ebe2383674d149958cf7c55791ebb0da9e22849 --- nodepool/cmd/nodepoold.py | 4 +--- nodepool/nodepool.py | 9 +-------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/nodepool/cmd/nodepoold.py b/nodepool/cmd/nodepoold.py index 625e57584..5f6b9a6fc 100644 --- a/nodepool/cmd/nodepoold.py +++ b/nodepool/cmd/nodepoold.py @@ -86,7 +86,6 @@ class NodePoolDaemon(nodepool.cmd.NodepoolApp): default=4, help='number of upload workers', type=int) parser.add_argument('--no-deletes', action='store_true') - parser.add_argument('--no-launches', action='store_true') parser.add_argument('--no-webapp', action='store_true') parser.add_argument('--version', dest='version', action='store_true', help='show version') @@ -105,8 +104,7 @@ class NodePoolDaemon(nodepool.cmd.NodepoolApp): self.setup_logging() self.pool = nodepool.nodepool.NodePool(self.args.secure, self.args.config, - self.args.no_deletes, - self.args.no_launches) + self.args.no_deletes) if self.args.builder: log.warning( "Note: nodepool no longer automatically builds images, " diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 6d1d41bd1..7a714157c 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -697,12 +697,11 @@ class NodePool(threading.Thread): log = logging.getLogger("nodepool.NodePool") def __init__(self, securefile, configfile, no_deletes=False, - no_launches=False, watermark_sleep=WATERMARK_SLEEP): + watermark_sleep=WATERMARK_SLEEP): threading.Thread.__init__(self, name='NodePool') self.securefile = securefile self.configfile = configfile self.no_deletes = no_deletes - self.no_launches = no_launches self.watermark_sleep = watermark_sleep self._stopped = False self.config = None @@ -1019,10 +1018,6 @@ class NodePool(threading.Thread): ''' Start point for the NodePool thread. ''' - - if self.no_launches: - return - # Provider threads keyed by provider name provider_threads = {} @@ -1060,8 +1055,6 @@ class NodePool(threading.Thread): thd.join() def _run(self, session, allocation_history): - if self.no_launches: - return # Make up the subnode deficit first to make sure that an # already allocated node has priority in filling its subnodes # ahead of new nodes. From 6d082ddd50c9ba143e1ac3fbcdee9f3808810157 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 9 Jan 2017 13:26:42 -0500 Subject: [PATCH 012/309] Remove --no-builder nodepoold option This isn't needed in v3. See similar change: I0dee331e41ec39ac809fc863dd833077c7d0edeb Change-Id: Ia10edac698a69fb33103845339968d473bec4dc0 --- nodepool/cmd/nodepoold.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/nodepool/cmd/nodepoold.py b/nodepool/cmd/nodepoold.py index 5f6b9a6fc..8e1337319 100644 --- a/nodepool/cmd/nodepoold.py +++ b/nodepool/cmd/nodepoold.py @@ -74,17 +74,6 @@ class NodePoolDaemon(nodepool.cmd.NodepoolApp): parser.add_argument('-p', dest='pidfile', help='path to pid file', default='/var/run/nodepool/nodepool.pid') - # TODO(pabelanger): Deprecated flag, remove in the future. - parser.add_argument('--no-builder', dest='builder', - action='store_false') - # TODO(pabelanger): Deprecated flag, remove in the future. - parser.add_argument('--build-workers', dest='build_workers', - default=1, help='number of build workers', - type=int) - # TODO(pabelanger): Deprecated flag, remove in the future. 
- parser.add_argument('--upload-workers', dest='upload_workers', - default=4, help='number of upload workers', - type=int) parser.add_argument('--no-deletes', action='store_true') parser.add_argument('--no-webapp', action='store_true') parser.add_argument('--version', dest='version', action='store_true', @@ -105,16 +94,6 @@ class NodePoolDaemon(nodepool.cmd.NodepoolApp): self.pool = nodepool.nodepool.NodePool(self.args.secure, self.args.config, self.args.no_deletes) - if self.args.builder: - log.warning( - "Note: nodepool no longer automatically builds images, " - "please ensure the separate nodepool-builder process is " - "running if you haven't already") - else: - log.warning( - "--no-builder is deprecated and will be removed in the near " - "future. Update your service scripts to avoid a breakage.") - if not self.args.no_webapp: self.webapp = nodepool.webapp.WebApp(self.pool) From d3a590417e080cb454a789d19b5b954b249fe962 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 9 Jan 2017 15:56:38 -0500 Subject: [PATCH 013/309] Handle provider removal from config If a provider is removed from the config file, then that provider should stop handling node requests. The provider thread will shut itself down in that case. Also, if a previously present provider is removed and then re-added back to the config at some point, we should recognize that and restart the thread for it. Change-Id: I8eff48f35633d92a993c5979017bd06e7480c35a --- nodepool/nodepool.py | 45 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 7a714157c..491afc5be 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -672,7 +672,7 @@ class SubNodeLauncher(threading.Thread): class ProviderWorker(threading.Thread): - def __init__(self, zk, provider): + def __init__(self, configfile, zk, provider): threading.Thread.__init__( self, name='ProviderWorker.%s' % provider.name ) @@ -680,13 +680,44 @@ class ProviderWorker(threading.Thread): self.provider = provider self.zk = zk self.running = False + self.configfile = configfile + + #---------------------------------------------------------------- + # Private methods + #---------------------------------------------------------------- + + def _updateProvider(self): + ''' + Update the provider definition from the config file. + + If this provider has been removed from the config, we need to + stop processing the request queue. This will effectively cause + this thread to terminate. + ''' + config = nodepool_config.loadConfig(self.configfile) + + if self.provider.name not in config.providers.keys(): + self.log.info("Provider %s removed from config" + % self.provider.name) + self.stop() + + # TODO(Shrews): Should we remove any existing nodes from the + # provider here? 
+ else: + self.provider = config.providers[self.provider.name] + + #---------------------------------------------------------------- + # Public methods + #---------------------------------------------------------------- def run(self): self.running = True while self.running: self.log.debug("Getting job from ZK queue") + # TODO(Shrews): Actually do queue work here time.sleep(10) + self._updateProvider() def stop(self): self.log.info("%s received stop" % self.name) @@ -1033,13 +1064,21 @@ class NodePool(threading.Thread): # Make sure we're always registered with ZK self.zk.registerLauncher(self.launcher_id) - # Start provider threads for each provider in the config + # Start (or restart) provider threads for each provider in + # the config. Removing a provider from the config and then + # adding it back would cause a restart. for p in self.config.providers.values(): if p.name not in provider_threads.keys(): - t = ProviderWorker(self.zk, p) + t = ProviderWorker(self.configfile, self.zk, p) self.log.info( "Starting %s" % t.name) t.start() provider_threads[p.name] = t + elif not provider_threads[p.name].isAlive(): + provider_threads[p.name].join() + t = ProviderWorker(self.configfile, self.zk, p) + self.log.info( "Restarting %s" % t.name) + t.start() + provider_threads[p.name] = t except Exception: self.log.exception("Exception in main loop:") From 22c35e0756631104a9d44a9e808db92e4f4eba85 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 10 Jan 2017 16:42:32 -0500 Subject: [PATCH 014/309] Add framework for handling node requests Each ProviderWorker will handle node requests and assign those to threads, represented by the new NodeRequestWorker class. A node request is locked before being passed off to a NodeRequestWorker which will mark it as PENDING, process it, and mark it as FULFILLED before releasing the lock. Change-Id: I529a9c6d94bbec1c14b95d12316b8d576e4c2183 --- nodepool/nodepool.py | 75 ++++++++++++++++++++++++++++++++++++++++++-- nodepool/zk.py | 72 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 2 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 491afc5be..2c080c849 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -671,7 +671,54 @@ class SubNodeLauncher(threading.Thread): return dt +class NodeRequestWorker(threading.Thread): + ''' + Class to process a single node request. + + The ProviderWorker thread will instantiate a class of this type for each + node request that it pulls from ZooKeeper. That request will be assigned + to this thread for it to process. + ''' + + def __init__(self, zk, request): + threading.Thread.__init__( + self, name='NodeRequestWorker.%s' % request.id + ) + self.log = logging.getLogger("nodepool.%s" % self.name) + self.zk = zk + self.request = request + + def run(self): + self.log.debug("Handling request %s" % self.request) + try: + self._run() + except Exception: + self.log.exception("Exception in NodeRequestWorker:") + self.request.state = zk.FAILED + self.zk.updateNodeRequest(self.request) + self.zk.unlockNodeRequest(self.request) + + def _run(self): + self.request.state = zk.PENDING + self.zk.updateNodeRequest(self.request) + + # TODO(Shrews): Make magic happen here + + self.request.state = zk.FULFILLED + self.zk.updateNodeRequest(self.request) + self.zk.unlockNodeRequest(self.request) + + class ProviderWorker(threading.Thread): + ''' + Class that manages node requests for a single provider. 
+ + The NodePool thread will instantiate a class of this type for each + provider found in the nodepool configuration file. If the provider to + which this thread is assigned is removed from the configuration file, then + that will be recognized and this thread will shut itself down. + ''' + def __init__(self, configfile, zk, provider): threading.Thread.__init__( self, name='ProviderWorker.%s' % provider.name @@ -714,8 +761,32 @@ class ProviderWorker(threading.Thread): self.running = True while self.running: - self.log.debug("Getting job from ZK queue") - # TODO(Shrews): Actually do queue work here + self.log.debug("Getting node request from ZK queue") + + for req_id in self.zk.getNodeRequests(): + req = self.zk.getNodeRequest(req_id) + if not req: + continue + + # Only interested in unhandled requests + if req.state != zk.REQUESTED: + continue + + try: + self.zk.lockNodeRequest(req, blocking=False) + except exceptions.ZKLockException: + continue + + # Make sure the state didn't change on us + if req.state != zk.REQUESTED: + self.zk.unlockNodeRequest(req) + continue + + # Got a lock, so assign it + self.log.info("Assigning node request %s" % req.id) + t = NodeRequestWorker(self.zk, req) + t.start() + time.sleep(10) self._updateProvider() diff --git a/nodepool/zk.py b/nodepool/zk.py index ca34fdef8..6ded8be27 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -294,6 +294,7 @@ class NodeRequest(BaseModel): def __init__(self, id=None): super(NodeRequest, self).__init__(id) + self.lock = None def __repr__(self): d = self.toDict() @@ -344,6 +345,7 @@ class ZooKeeper(object): IMAGE_ROOT = "/nodepool/images" LAUNCHER_ROOT = "/nodepool/launchers" REQUEST_ROOT = "/nodepool/requests" + REQUEST_LOCK_ROOT = "/nodepool/requests-lock" def __init__(self): ''' @@ -391,6 +393,9 @@ class ZooKeeper(object): def _requestPath(self, request): return "%s/%s" % (self.REQUEST_ROOT, request) + def _requestLockPath(self, request): + return "%s/%s" % (self.REQUEST_LOCK_ROOT, request) + def _dictToStr(self, data): return json.dumps(data) @@ -1103,3 +1108,70 @@ class ZooKeeper(object): d = NodeRequest.fromDict(self._strToDict(data), request) d.stat = stat return d + + def updateNodeRequest(self, request): + ''' + Update a node request. + + The request must already be locked before updating. + + :param NodeRequest request: The node request to update. + ''' + if request.lock is None: + raise Exception("%s must be locked before updating." % request) + + # Validate it still exists before updating + if not self.getNodeRequest(request.id): + raise Exception( + "Attempt to update non-existing request %s" % request) + + path = self._requestPath(request.id) + data = request.toDict() + self.client.set(path, self._dictToStr(data)) + + def lockNodeRequest(self, request, blocking=True, timeout=None): + ''' + Lock a node request. + + This will set the `lock` attribute of the request object when the + lock is successfully acquired. + + :param NodeRequest request: The request to lock. + :param bool blocking: Whether or not to block on trying to + acquire the lock + :param int timeout: When blocking, how long to wait for the lock + to get acquired. None, the default, waits forever. + + :raises: TimeoutException if we failed to acquire the lock when + blocking with a timeout. ZKLockException if we are not blocking + and could not get the lock, or a lock is already held. 
+ ''' + path = self._requestLockPath(request.id) + try: + lock = Lock(self.client, path) + have_lock = lock.acquire(blocking, timeout) + except kze.LockTimeout: + raise npe.TimeoutException( + "Timeout trying to acquire lock %s" % path) + + # If we aren't blocking, it's possible we didn't get the lock + # because someone else has it. + if not have_lock: + raise npe.ZKLockException("Did not get lock on %s" % path) + + request.lock = lock + + def unlockNodeRequest(self, request): + ''' + Unlock a node request. + + The request must already have been locked. + + :param NodeRequest request: The request to unlock. + + :raises: ZKLockException if the request is not currently locked. + ''' + if request.lock is None: + raise npe.ZKLockException("Request %s does not hold a lock" % request) + request.lock.release() + request.lock = None From 8fd774493515a609bb6c7d057ccc6831456236eb Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 12 Jan 2017 14:50:20 -0500 Subject: [PATCH 015/309] Add support for max_concurrency for providers Add the capability to limit the number of node requests being handled simultaneously for a provider. The default does not force any limit. Change-Id: I49a2638c8003614ab4dc287d157abe873da81421 --- doc/source/configuration.rst | 6 ++ nodepool/cmd/config_validator.py | 1 + nodepool/config.py | 2 + nodepool/nodepool.py | 77 +++++++++++++------ .../tests/fixtures/config_validate/good.yaml | 1 + 5 files changed, 63 insertions(+), 24 deletions(-) diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index 2fc12d07a..9fb6bbf78 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -438,6 +438,12 @@ provider, the Nodepool image types are also defined (see OpenStack project and will attempt to clean unattached floating ips that may have leaked around restarts. + ``max-concurrency`` + Maximum number of node requests that this provider is allowed to handle + concurrently. The default, if not specified, is to have no maximum. Since + each node request is handled by a separate thread, this can be useful for + limiting the number of threads used by the nodepoold daemon. + .. 
_images: images diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index 1b8b32f9f..dd3102f01 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -66,6 +66,7 @@ class ConfigValidator: 'project-id': str, 'project-name': str, 'max-servers': int, + 'max-concurrency': int, 'pool': str, 'image-type': str, 'networks': [v.Any(old_network, network)], diff --git a/nodepool/config.py b/nodepool/config.py index bb482225d..3db182275 100644 --- a/nodepool/config.py +++ b/nodepool/config.py @@ -50,6 +50,7 @@ class Provider(ConfigValue): other.networks != self.networks or other.ipv6_preferred != self.ipv6_preferred or other.clean_floating_ips != self.clean_floating_ips or + other.max_concurrency != self.max_concurrency or other.azs != self.azs): return False new_images = other.images @@ -174,6 +175,7 @@ def loadConfig(config_path): p.cloud_config = _get_one_cloud(cloud_config, cloud_kwargs) p.region_name = provider.get('region-name') p.max_servers = provider['max-servers'] + p.max_concurrency = provider.get('max-concurrency', -1) p.keypair = provider.get('keypair', None) p.pool = provider.get('pool', None) p.rate = provider.get('rate', 1.0) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 2c080c849..5a56b1ddd 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -728,6 +728,7 @@ class ProviderWorker(threading.Thread): self.zk = zk self.running = False self.configfile = configfile + self.workers = [] #---------------------------------------------------------------- # Private methods @@ -753,6 +754,54 @@ class ProviderWorker(threading.Thread): else: self.provider = config.providers[self.provider.name] + def _processRequests(self): + self.log.debug("Getting node request from ZK queue") + + for req_id in self.zk.getNodeRequests(): + # Short-circuit for limited request handling + if (self.provider.max_concurrency > 0 + and self._activeWorkers() >= self.provider.max_concurrency + ): + return + + req = self.zk.getNodeRequest(req_id) + if not req: + continue + + # Only interested in unhandled requests + if req.state != zk.REQUESTED: + continue + + try: + self.zk.lockNodeRequest(req, blocking=False) + except exceptions.ZKLockException: + continue + + # Make sure the state didn't change on us + if req.state != zk.REQUESTED: + self.zk.unlockNodeRequest(req) + continue + + # Got a lock, so assign it + self.log.info("Assigning node request %s" % req.id) + t = NodeRequestWorker(self.zk, req) + t.start() + self.workers.append(t) + + def _activeWorkers(self): + ''' + Return a count of the number of requests actively being handled. + + This serves the dual-purpose of also removing completed requests from + our list of tracked threads. 
+ ''' + active = [] + for w in self.workers: + if w.isAlive(): + active.append(w) + self.workers = active + return len(self.workers) + #---------------------------------------------------------------- # Public methods #---------------------------------------------------------------- @@ -761,31 +810,11 @@ class ProviderWorker(threading.Thread): self.running = True while self.running: - self.log.debug("Getting node request from ZK queue") + if self.provider.max_concurrency == -1 and self.workers: + self.workers = [] - for req_id in self.zk.getNodeRequests(): - req = self.zk.getNodeRequest(req_id) - if not req: - continue - - # Only interested in unhandled requests - if req.state != zk.REQUESTED: - continue - - try: - self.zk.lockNodeRequest(req, blocking=False) - except exceptions.ZKLockException: - continue - - # Make sure the state didn't change on us - if req.state != zk.REQUESTED: - self.zk.unlockNodeRequest(req) - continue - - # Got a lock, so assign it - self.log.info("Assigning node request %s" % req.id) - t = NodeRequestWorker(self.zk, req) - t.start() + if self.provider.max_concurrency != 0: + self._processRequests() time.sleep(10) self._updateProvider() diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml index 623a2f5ca..1ef7a67f8 100644 --- a/nodepool/tests/fixtures/config_validate/good.yaml +++ b/nodepool/tests/fixtures/config_validate/good.yaml @@ -38,6 +38,7 @@ providers: auth-url: 'https://identity.example.com/v2.0/' boot-timeout: 120 max-servers: 184 + max-concurrency: 10 rate: 0.001 images: - name: trusty From 4f12a9116e5e312f3d0973c3ea86d95117420f55 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 17 Jan 2017 15:16:40 -0500 Subject: [PATCH 016/309] Begin implementing node request handling. This partially implements the suggested algorithm from the ZuulV3 spec (algorithm temporarily included in the docstring for the NodeRequestWorker). Specifically, this does: - Moves launcher registration from NodePool thread to the ProviderWorker threads so each can properly decline requests. - Skips node requests already declined by the ProviderWorker. - Declines node requests if the requested images are not available for the provider, or if provider quota would be exceeded. - Marks node request as failed if all launchers had declined it. - Adds a new Nodes model class, and a getNodes() and getNode() methods to the ZK API. This does not yet calculate node availability or launch nodes. Change-Id: I103b7d44e9cd1b4544aabd01c31966a3aaa45076 --- nodepool/nodepool.py | 107 +++++++++++++++++++++++++++++++++++--- nodepool/tests/test_zk.py | 53 ++++++++++++++++++- nodepool/zk.py | 96 ++++++++++++++++++++++++++++++++-- 3 files changed, 243 insertions(+), 13 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 5a56b1ddd..619a9df4f 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -680,14 +680,60 @@ class NodeRequestWorker(threading.Thread): to this thread for it to process. ''' - def __init__(self, zk, request): + def __init__(self, zk, launcher_id, provider, request): + ''' + :param ZooKeeper zk: Connected ZooKeeper object. + :param str launcher_id: ID of the launcher handling the request. + :param Provider provider: Provider object from the config file. + :param NodeRequest request: The request to handle. 
+ ''' threading.Thread.__init__( self, name='NodeRequestWorker.%s' % request.id ) self.log = logging.getLogger("nodepool.%s" % self.name) self.zk = zk + self.launcher_id = launcher_id + self.provider = provider self.request = request + def _imagesAvailable(self): + ''' + Determines if the requested images are available for this provider. + + :returns: True if it is available, False otherwise. + ''' + provider_images = self.provider.images.keys() + for node_type in self.request.node_types: + if node_type not in provider_images: + return False + return True + + def _countNodes(self): + ''' + Query ZooKeeper to determine the number of provider nodes launched. + + :returns: An integer for the number launched for this provider. + ''' + count = 0 + for node_id in self.zk.getNodes(): + node = self.zk.getNode(node_id) + if node.provider == self.provider.name: + count += 1 + return count + + def _wouldExceedQuota(self): + ''' + Determines if request would exceed provider quota. + + :returns: True if quota would be exceeded, False otherwise. + ''' + provider_max = self.provider.max_servers + num_requested = len(self.request.node_types) + num_in_use = self._countNodes() + if num_requested + num_in_use > provider_max: + return True + return False + def run(self): self.log.debug("Handling request %s" % self.request) try: @@ -699,6 +745,41 @@ class NodeRequestWorker(threading.Thread): self.zk.unlockNodeRequest(self.request) def _run(self): + ''' + Main body for the NodeRequestWorker. + + note:: This code is a bit racey in its calculation of the number of + nodes in use for quota purposes. It is possible for multiple + launchers to be doing this calculation at the same time. Since we + currently have no locking mechanism around the "in use" + calculation, if we are at the edge of the quota, one of the + launchers could attempt to launch a new node after the other + launcher has already started doing so. This would cause an + expected failure from the underlying library, which is ok for now. + + Algorithm from spec:: + + # If image not available, decline + # If request > quota, decline + # If request < quota and request > available nodes (due to current + usage), begin satisfying the request and do not process further + requests until satisfied + # If request < quota and request < available nodes, satisfy the + request and continue processing further requests + ''' + if not self._imagesAvailable() or self._wouldExceedQuota(): + self.request.declined_by.append(self.launcher_id) + launchers = set(self.zk.getRegisteredLaunchers()) + if launchers.issubset(set(self.request.declined_by)): + # All launchers have declined it + self.request.state = zk.FAILED + self.zk.updateNodeRequest(self.request) + self.zk.unlockNodeRequest(self.request) + return + + # TODO(Shrews): Determine node availability and if we need to launch + # new nodes, or reuse existing nodes. 
+ self.request.state = zk.PENDING self.zk.updateNodeRequest(self.request) @@ -729,6 +810,9 @@ class ProviderWorker(threading.Thread): self.running = False self.configfile = configfile self.workers = [] + self.launcher_id = "%s-%s-%s" % (socket.gethostname(), + os.getpid(), + self.ident) #---------------------------------------------------------------- # Private methods @@ -772,6 +856,10 @@ class ProviderWorker(threading.Thread): if req.state != zk.REQUESTED: continue + # Skip it if we've already declined + if self.launcher_id in req.declined_by: + continue + try: self.zk.lockNodeRequest(req, blocking=False) except exceptions.ZKLockException: @@ -784,7 +872,8 @@ class ProviderWorker(threading.Thread): # Got a lock, so assign it self.log.info("Assigning node request %s" % req.id) - t = NodeRequestWorker(self.zk, req) + t = NodeRequestWorker(self.zk, self.launcher_id, + self.provider, req) t.start() self.workers.append(t) @@ -810,6 +899,14 @@ class ProviderWorker(threading.Thread): self.running = True while self.running: + # Don't do work if we've lost communication with the ZK cluster + while self.zk and (self.zk.suspended or self.zk.lost): + self.log.info("ZooKeeper suspended. Waiting") + time.sleep(SUSPEND_WAIT_TIME) + + # Make sure we're always registered with ZK + self.zk.registerLauncher(self.launcher_id) + if self.provider.max_concurrency == -1 and self.workers: self.workers = [] @@ -844,9 +941,6 @@ class NodePool(threading.Thread): self._instance_delete_threads = {} self._instance_delete_threads_lock = threading.Lock() self._wake_condition = threading.Condition() - self.launcher_id = "%s-%s-%s" % (socket.gethostname(), - os.getpid(), - self.ident) def stop(self): self._stopped = True @@ -1161,9 +1255,6 @@ class NodePool(threading.Thread): self.log.info("ZooKeeper suspended. Waiting") time.sleep(SUSPEND_WAIT_TIME) - # Make sure we're always registered with ZK - self.zk.registerLauncher(self.launcher_id) - # Start (or restart) provider threads for each provider in # the config. Removing a provider from the config and then # adding it back would cause a restart. 
diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 5cc27fbce..3e8a40d3d 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -468,6 +468,26 @@ class TestZooKeeper(tests.DBTestCase): def test_getNodeRequest_not_found(self): self.assertIsNone(self.zk.getNodeRequest("invalid")) + def test_getNodes(self): + self.zk.client.create(self.zk._nodePath('100'), makepath=True) + self.zk.client.create(self.zk._nodePath('200'), makepath=True) + nodes = self.zk.getNodes() + self.assertIn('100', nodes) + self.assertIn('200', nodes) + + def test_getNode(self): + n = zk.Node('100') + n.state = zk.BUILDING + path = self.zk._nodePath(n.id) + self.zk.client.create(path, value=self.zk._dictToStr(n.toDict()), + makepath=True) + o = self.zk.getNode(n.id) + self.assertIsInstance(o, zk.Node) + self.assertEqual(n.id, o.id) + + def test_getNode_not_found(self): + self.assertIsNone(self.zk.getNode("invalid")) + class TestZKModel(tests.BaseTestCase): @@ -561,20 +581,51 @@ class TestZKModel(tests.BaseTestCase): def test_NodeRequest_toDict(self): o = zk.NodeRequest("500-123") + o.declined_by.append("abc") + o.node_types.append('trusty') d = o.toDict() self.assertNotIn('id', d) self.assertIn('state', d) self.assertIn('state_time', d) + self.assertEqual(d['declined_by'], ['abc']) + self.assertEqual(d['node_types'], ['trusty']) def test_NodeRequest_fromDict(self): now = int(time.time()) req_id = "500-123" d = { 'state': zk.REQUESTED, - 'state_time': now + 'state_time': now, + 'declined_by': ['abc'], + 'node_types': ['trusty'], } o = zk.NodeRequest.fromDict(d, req_id) self.assertEqual(o.id, req_id) self.assertEqual(o.state, d['state']) self.assertEqual(o.state_time, d['state_time']) + self.assertEqual(o.declined_by, d['declined_by']) + + def test_Node_toDict(self): + o = zk.Node('123') + o.provider = 'rax' + d = o.toDict() + self.assertNotIn('id', d) + self.assertIn('state', d) + self.assertIn('state_time', d) + self.assertEqual(d['provider'], 'rax') + + def test_Node_fromDict(self): + now = int(time.time()) + node_id = '123' + d = { + 'state': zk.READY, + 'state_time': now, + 'provider': 'rax', + } + + o = zk.Node.fromDict(d, node_id) + self.assertEqual(o.id, node_id) + self.assertEqual(o.state, d['state']) + self.assertEqual(o.state_time, d['state_time']) + self.assertEqual(o.provider, 'rax') diff --git a/nodepool/zk.py b/nodepool/zk.py index 6ded8be27..0924e065e 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -25,13 +25,13 @@ from kazoo.recipe.lock import Lock from nodepool import exceptions as npe # States: -# We are building this image but it is not ready for use. +# We are building this image (or node) but it is not ready for use. BUILDING = 'building' # The image is being uploaded. UPLOADING = 'uploading' -# The image/upload is ready for use. +# The image/upload/node is ready for use. READY = 'ready' -# The image/upload should be deleted. +# The image/upload/node should be deleted. DELETING = 'deleting' # The build failed. FAILED = 'failed' @@ -41,6 +41,14 @@ REQUESTED = 'requested' FULFILLED = 'fulfilled' # Node request is being worked. 
PENDING = 'pending' +# Node is being tested +TESTING = 'testing' +# Node is being used +IN_USE = 'in-use' +# Node has been used +USED = 'used' +# Node is being held +HOLD = 'hold' class ZooKeeperConnectionConfig(object): @@ -295,6 +303,8 @@ class NodeRequest(BaseModel): def __init__(self, id=None): super(NodeRequest, self).__init__(id) self.lock = None + self.declined_by = [] + self.node_types = [] def __repr__(self): d = self.toDict() @@ -307,6 +317,8 @@ class NodeRequest(BaseModel): Convert a NodeRequest object's attributes to a dictionary. ''' d = super(NodeRequest, self).toDict() + d['declined_by'] = self.declined_by + d['node_types'] = self.node_types return d @staticmethod @@ -317,10 +329,53 @@ class NodeRequest(BaseModel): :param dict d: The dictionary. :param str o_id: The object ID. - :returns: An initialized ImageBuild object. + :returns: An initialized NodeRequest object. ''' o = NodeRequest(o_id) super(NodeRequest, o).fromDict(d) + o.declined_by = d.get('declined_by', []) + o.node_types = d.get('node_types', []) + return o + + +class Node(BaseModel): + ''' + Class representing a launched node. + ''' + VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED, + HOLD, DELETING]) + + def __init__(self, id=None): + super(Node, self).__init__(id) + self.provider = None + + def __repr__(self): + d = self.toDict() + d['id'] = self.id + d['stat'] = self.stat + return '' % d + + def toDict(self): + ''' + Convert a Node object's attributes to a dictionary. + ''' + d = super(Node, self).toDict() + d['provider'] = self.provider + return d + + @staticmethod + def fromDict(d, o_id=None): + ''' + Create a Node object from a dictionary. + + :param dict d: The dictionary. + :param str o_id: The object ID. + + :returns: An initialized Node object. + ''' + o = Node(o_id) + super(Node, o).fromDict(d) + o.provider = d.get('provider') return o @@ -344,6 +399,7 @@ class ZooKeeper(object): IMAGE_ROOT = "/nodepool/images" LAUNCHER_ROOT = "/nodepool/launchers" + NODE_ROOT = "/nodepool/nodes" REQUEST_ROOT = "/nodepool/requests" REQUEST_LOCK_ROOT = "/nodepool/requests-lock" @@ -390,6 +446,9 @@ class ZooKeeper(object): def _launcherPath(self, launcher): return "%s/%s" % (self.LAUNCHER_ROOT, launcher) + def _nodePath(self, node): + return "%s/%s" % (self.NODE_ROOT, node) + def _requestPath(self, request): return "%s/%s" % (self.REQUEST_ROOT, request) @@ -1175,3 +1234,32 @@ class ZooKeeper(object): raise npe.ZKLockException("Request %s does not hold a lock" % request) request.lock.release() request.lock = None + + def getNodes(self): + ''' + Get the current list of all nodes. + + :returns: A list of nodes. + ''' + try: + return self.client.get_children(self.NODE_ROOT) + except kze.NoNodeError: + return [] + + def getNode(self, node): + ''' + Get the data for a specific node. + + :param str node: The node ID. + + :returns: The node data, or None if the node was not found. + ''' + path = self._nodePath(node) + try: + data, stat = self.client.get(path) + except kze.NoNodeError: + return None + + d = Node.fromDict(self._strToDict(data), node) + d.stat = stat + return d From 8b44689550c3135b7e5e61afbae4f4a08a89bb7e Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 18 Jan 2017 14:15:36 -0800 Subject: [PATCH 017/309] Add --fake command line option to builder This allows the builder to be run in an otherwise production-like setting but with the actual image build stubbed out. 
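For example, a test or local harness might drive the builder directly with the
new flag (a sketch; the config path here is hypothetical):

    from nodepool import builder

    # With fake=True, dib_cmd points at the bundled
    # nodepool/tests/fake-image-create stub instead of disk-image-create,
    # so no real diskimage-builder run happens.
    nb = builder.NodePoolBuilder('/path/to/nodepool.yaml',
                                 num_builders=1, num_uploaders=4,
                                 fake=True)

On the command line, the equivalent is the new --fake flag on the builder
daemon.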
Change-Id: If41428605c13f263da78ebe382ac83b4c1858c42 --- nodepool/builder.py | 10 ++++++++-- nodepool/cmd/builder.py | 5 ++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/nodepool/builder.py b/nodepool/builder.py index 3c047bb2c..c20370904 100644 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -1034,13 +1034,15 @@ class NodePoolBuilder(object): ''' log = logging.getLogger("nodepool.builder.NodePoolBuilder") - def __init__(self, config_path, num_builders=1, num_uploaders=4): + def __init__(self, config_path, num_builders=1, num_uploaders=4, + fake=False): ''' Initialize the NodePoolBuilder object. :param str config_path: Path to configuration file. :param int num_builders: Number of build workers to start. :param int num_uploaders: Number of upload workers to start. + :param bool fake: Whether to fake the image builds. ''' self._config_path = config_path self._config = None @@ -1053,7 +1055,11 @@ class NodePoolBuilder(object): self.cleanup_interval = 60 self.build_interval = 10 self.upload_interval = 10 - self.dib_cmd = 'disk-image-create' + if fake: + self.dib_cmd = os.path.join(os.path.dirname(__file__), '..', + 'nodepool/tests/fake-image-create') + else: + self.dib_cmd = 'disk-image-create' self.zk = None # This lock is needed because the run() method is started in a diff --git a/nodepool/cmd/builder.py b/nodepool/cmd/builder.py index 56d96188f..55d3a4370 100644 --- a/nodepool/cmd/builder.py +++ b/nodepool/cmd/builder.py @@ -52,13 +52,16 @@ class NodePoolBuilderApp(nodepool.cmd.NodepoolApp): parser.add_argument('--upload-workers', dest='upload_workers', default=4, help='number of upload workers', type=int) + parser.add_argument('--fake', action='store_true', + help='Do not actually run diskimage-builder ' + '(used for testing)') self.args = parser.parse_args() def main(self): self.setup_logging() self.nb = builder.NodePoolBuilder( self.args.config, self.args.build_workers, - self.args.upload_workers) + self.args.upload_workers, self.args.fake) signal.signal(signal.SIGINT, self.sigint_handler) signal.signal(signal.SIGUSR2, nodepool.cmd.stack_dump_handler) From de4068ecc87af7dbe78d090bbf32d24c485fb177 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 19 Jan 2017 13:36:40 -0500 Subject: [PATCH 018/309] Fix for launched node counting It's possible a node could be gone between getting the list and pulling the data for the node. Add a check for that. Also, minor code cleanup/improvement for _imagesAvailable() and _wouldExceedQuota(). Change-Id: I95da69baf5078919f559ad20e99d2e2fc05826fd --- nodepool/nodepool.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 619a9df4f..d993e3fab 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -702,11 +702,9 @@ class NodeRequestWorker(threading.Thread): :returns: True if it is available, False otherwise. 
''' - provider_images = self.provider.images.keys() - for node_type in self.request.node_types: - if node_type not in provider_images: - return False - return True + provider_images = set(self.provider.images.keys()) + requested_images = set(self.request.node_types) + return requested_images.issubset(provider_images) def _countNodes(self): ''' @@ -717,7 +715,7 @@ class NodeRequestWorker(threading.Thread): count = 0 for node_id in self.zk.getNodes(): node = self.zk.getNode(node_id) - if node.provider == self.provider.name: + if node and node.provider == self.provider.name: count += 1 return count @@ -730,9 +728,7 @@ class NodeRequestWorker(threading.Thread): provider_max = self.provider.max_servers num_requested = len(self.request.node_types) num_in_use = self._countNodes() - if num_requested + num_in_use > provider_max: - return True - return False + return num_requested + num_in_use > provider_max def run(self): self.log.debug("Handling request %s" % self.request) From 3e944292444dcfc74172b4309cbc0a8183c76184 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 19 Jan 2017 15:30:38 -0500 Subject: [PATCH 019/309] Query ZooKeeper to determine image availability The config is not the true source of wisdom. Only believe that which you can see. Change-Id: Iac826f17c83e4e5de9d6745ae8c613f4776d75be --- nodepool/nodepool.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index d993e3fab..e441c8b6a 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -700,11 +700,15 @@ class NodeRequestWorker(threading.Thread): ''' Determines if the requested images are available for this provider. + ZooKeeper is queried for an image uploaded to the provider that is + in the READY state. + :returns: True if it is available, False otherwise. ''' - provider_images = set(self.provider.images.keys()) - requested_images = set(self.request.node_types) - return requested_images.issubset(provider_images) + for img in self.request.node_types: + if not self.zk.getMostRecentImageUpload(img, self.provider.name): + return False + return True def _countNodes(self): ''' From 37bf79011af93ee4a60221021acbacc8102c6acb Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 20 Jan 2017 13:21:18 -0500 Subject: [PATCH 020/309] Add a serialize() method to ZK model definitions Avoid the awkward _dictToStr(thing.toDict()) pattern by just having a serialize() method to handle converting objects to save to ZK. 
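Roughly, callers change from serializing by hand to asking the model object to
do it (pattern illustrative, taken from the test updates below):

    # before
    self.zk.client.create(path, value=self.zk._dictToStr(build.toDict()),
                          makepath=True)

    # after
    self.zk.client.create(path, value=build.serialize(), makepath=True)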
Change-Id: I2831595f9f1ced5b93120a7388a1b1a966ab15c8 --- nodepool/tests/test_zk.py | 117 +++++++++++++++++--------------------- nodepool/zk.py | 22 ++++--- 2 files changed, 65 insertions(+), 74 deletions(-) diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 3e8a40d3d..5146775cf 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -307,44 +307,38 @@ class TestZooKeeper(tests.DBTestCase): def test_getBuilds_any(self): image = "ubuntu-trusty" path = self.zk._imageBuildsPath(image) - v1 = {'state': zk.READY} - v2 = {'state': zk.BUILDING} - v3 = {'state': zk.FAILED} - v4 = {'state': zk.DELETING} - v5 = {} - self.zk.client.create(path + "/1", value=self.zk._dictToStr(v1), - makepath=True) - self.zk.client.create(path + "/2", value=self.zk._dictToStr(v2), - makepath=True) - self.zk.client.create(path + "/3", value=self.zk._dictToStr(v3), - makepath=True) - self.zk.client.create(path + "/4", value=self.zk._dictToStr(v4), - makepath=True) - self.zk.client.create(path + "/5", value=self.zk._dictToStr(v5), - makepath=True) + v1 = zk.ImageBuild() + v1.state = zk.READY + v2 = zk.ImageBuild() + v2.state = zk.BUILDING + v3 = zk.ImageBuild() + v3.state = zk.FAILED + v4 = zk.ImageBuild() + v4.state = zk.DELETING + self.zk.client.create(path + "/1", value=v1.serialize(), makepath=True) + self.zk.client.create(path + "/2", value=v2.serialize(), makepath=True) + self.zk.client.create(path + "/3", value=v3.serialize(), makepath=True) + self.zk.client.create(path + "/4", value=v4.serialize(), makepath=True) self.zk.client.create(path + "/lock", makepath=True) matches = self.zk.getBuilds(image, None) - self.assertEqual(5, len(matches)) + self.assertEqual(4, len(matches)) def test_getBuilds(self): image = "ubuntu-trusty" path = self.zk._imageBuildsPath(image) - v1 = {'state': zk.BUILDING} - v2 = {'state': zk.READY} - v3 = {'state': zk.FAILED} - v4 = {'state': zk.DELETING} - v5 = {} - self.zk.client.create(path + "/1", value=self.zk._dictToStr(v1), - makepath=True) - self.zk.client.create(path + "/2", value=self.zk._dictToStr(v2), - makepath=True) - self.zk.client.create(path + "/3", value=self.zk._dictToStr(v3), - makepath=True) - self.zk.client.create(path + "/4", value=self.zk._dictToStr(v4), - makepath=True) - self.zk.client.create(path + "/5", value=self.zk._dictToStr(v5), - makepath=True) + v1 = zk.ImageBuild() + v1.state = zk.READY + v2 = zk.ImageBuild() + v2.state = zk.BUILDING + v3 = zk.ImageBuild() + v3.state = zk.FAILED + v4 = zk.ImageBuild() + v4.state = zk.DELETING + self.zk.client.create(path + "/1", value=v1.serialize(), makepath=True) + self.zk.client.create(path + "/2", value=v2.serialize(), makepath=True) + self.zk.client.create(path + "/3", value=v3.serialize(), makepath=True) + self.zk.client.create(path + "/4", value=v4.serialize(), makepath=True) self.zk.client.create(path + "/lock", makepath=True) matches = self.zk.getBuilds(image, [zk.DELETING, zk.FAILED]) @@ -352,21 +346,18 @@ class TestZooKeeper(tests.DBTestCase): def test_getUploads(self): path = self.zk._imageUploadPath("trusty", "000", "rax") - v1 = {'state': zk.READY} - v2 = {'state': zk.UPLOADING} - v3 = {'state': zk.FAILED} - v4 = {'state': zk.DELETING} - v5 = {} - self.zk.client.create(path + "/1", value=self.zk._dictToStr(v1), - makepath=True) - self.zk.client.create(path + "/2", value=self.zk._dictToStr(v2), - makepath=True) - self.zk.client.create(path + "/3", value=self.zk._dictToStr(v3), - makepath=True) - self.zk.client.create(path + "/4", value=self.zk._dictToStr(v4), - 
makepath=True) - self.zk.client.create(path + "/5", value=self.zk._dictToStr(v5), - makepath=True) + v1 = zk.ImageUpload() + v1.state = zk.READY + v2 = zk.ImageUpload() + v2.state = zk.UPLOADING + v3 = zk.ImageUpload() + v3.state = zk.FAILED + v4 = zk.ImageUpload() + v4.state = zk.DELETING + self.zk.client.create(path + "/1", value=v1.serialize(), makepath=True) + self.zk.client.create(path + "/2", value=v2.serialize(), makepath=True) + self.zk.client.create(path + "/3", value=v3.serialize(), makepath=True) + self.zk.client.create(path + "/4", value=v4.serialize(), makepath=True) self.zk.client.create(path + "/lock", makepath=True) matches = self.zk.getUploads("trusty", "000", "rax", @@ -375,25 +366,22 @@ class TestZooKeeper(tests.DBTestCase): def test_getUploads_any(self): path = self.zk._imageUploadPath("trusty", "000", "rax") - v1 = {'state': zk.READY} - v2 = {'state': zk.UPLOADING} - v3 = {'state': zk.FAILED} - v4 = {'state': zk.DELETING} - v5 = {} - self.zk.client.create(path + "/1", value=self.zk._dictToStr(v1), - makepath=True) - self.zk.client.create(path + "/2", value=self.zk._dictToStr(v2), - makepath=True) - self.zk.client.create(path + "/3", value=self.zk._dictToStr(v3), - makepath=True) - self.zk.client.create(path + "/4", value=self.zk._dictToStr(v4), - makepath=True) - self.zk.client.create(path + "/5", value=self.zk._dictToStr(v5), - makepath=True) + v1 = zk.ImageUpload() + v1.state = zk.READY + v2 = zk.ImageUpload() + v2.state = zk.UPLOADING + v3 = zk.ImageUpload() + v3.state = zk.FAILED + v4 = zk.ImageUpload() + v4.state = zk.DELETING + self.zk.client.create(path + "/1", value=v1.serialize(), makepath=True) + self.zk.client.create(path + "/2", value=v2.serialize(), makepath=True) + self.zk.client.create(path + "/3", value=v3.serialize(), makepath=True) + self.zk.client.create(path + "/4", value=v4.serialize(), makepath=True) self.zk.client.create(path + "/lock", makepath=True) matches = self.zk.getUploads("trusty", "000", "rax", None) - self.assertEqual(5, len(matches)) + self.assertEqual(4, len(matches)) def test_deleteBuild(self): image = 'trusty' @@ -459,7 +447,7 @@ class TestZooKeeper(tests.DBTestCase): r = zk.NodeRequest("500-123") r.state = zk.REQUESTED path = self.zk._requestPath(r.id) - self.zk.client.create(path, value=self.zk._dictToStr(r.toDict()), + self.zk.client.create(path, value=r.serialize(), makepath=True, ephemeral=True) o = self.zk.getNodeRequest(r.id) self.assertIsInstance(o, zk.NodeRequest) @@ -479,8 +467,7 @@ class TestZooKeeper(tests.DBTestCase): n = zk.Node('100') n.state = zk.BUILDING path = self.zk._nodePath(n.id) - self.zk.client.create(path, value=self.zk._dictToStr(n.toDict()), - makepath=True) + self.zk.client.create(path, value=n.serialize(), makepath=True) o = self.zk.getNode(n.id) self.assertIsInstance(o, zk.Node) self.assertEqual(n.id, o.id) diff --git a/nodepool/zk.py b/nodepool/zk.py index 0924e065e..e939805c5 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -171,6 +171,14 @@ class BaseModel(object): if 'state_time' in d: self.state_time = d['state_time'] + def serialize(self): + ''' + Return a representation of the object as a string. + + Used for storing the object data in ZooKeeper. 
+ ''' + return json.dumps(self.toDict()) + class ImageBuild(BaseModel): ''' @@ -455,9 +463,6 @@ class ZooKeeper(object): def _requestLockPath(self, request): return "%s/%s" % (self.REQUEST_LOCK_ROOT, request) - def _dictToStr(self, data): - return json.dumps(data) - def _strToDict(self, data): return json.loads(data) @@ -850,13 +855,13 @@ class ZooKeeper(object): if build_number is None: path = self.client.create( build_path, - value=self._dictToStr(build_data.toDict()), + value=build_data.serialize(), sequence=True, makepath=True) build_number = path.split("/")[-1] else: path = build_path + build_number - self.client.set(path, self._dictToStr(build_data.toDict())) + self.client.set(path, build_data.serialize()) return build_number @@ -1020,13 +1025,13 @@ class ZooKeeper(object): if upload_number is None: path = self.client.create( upload_path, - value=self._dictToStr(image_data.toDict()), + value=image_data.serialize(), sequence=True, makepath=True) upload_number = path.split("/")[-1] else: path = upload_path + upload_number - self.client.set(path, self._dictToStr(image_data.toDict())) + self.client.set(path, image_data.serialize()) return upload_number @@ -1185,8 +1190,7 @@ class ZooKeeper(object): "Attempt to update non-existing request %s" % request) path = self._requestPath(request.id) - data = request.toDict() - self.client.set(path, self._dictToStr(data)) + self.client.set(path, request.serialize()) def lockNodeRequest(self, request, blocking=True, timeout=None): ''' From 77cf53e429ecb0774d520fb158c68fb23a4b911d Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 20 Jan 2017 14:28:24 -0500 Subject: [PATCH 021/309] Add API methods for locking/unlocking Nodes. Change-Id: I4ec14b90b193b21b4649ceb8c9d0dff25de9e6b8 --- nodepool/tests/test_zk.py | 23 ++++++++++++++++++ nodepool/zk.py | 51 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 5146775cf..e1a24201b 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -475,6 +475,29 @@ class TestZooKeeper(tests.DBTestCase): def test_getNode_not_found(self): self.assertIsNone(self.zk.getNode("invalid")) + def test_lockNode_multi(self): + node = zk.Node('100') + self.zk.lockNode(node) + with testtools.ExpectedException( + npe.ZKLockException, "Did not get lock on .*" + ): + self.zk.lockNode(node, blocking=False) + + def test_lockNode_unlockNode(self): + node = zk.Node('100') + self.zk.lockNode(node) + self.assertIsNotNone(node.lock) + self.assertIsNotNone( + self.zk.client.exists(self.zk._nodeLockPath(node.id)) + ) + self.zk.unlockNode(node) + self.assertIsNone(node.lock) + + def test_unlockNode_not_locked(self): + node = zk.Node('100') + with testtools.ExpectedException(npe.ZKLockException): + self.zk.unlockNode(node) + class TestZKModel(tests.BaseTestCase): diff --git a/nodepool/zk.py b/nodepool/zk.py index e939805c5..1237a7bf2 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -355,6 +355,7 @@ class Node(BaseModel): def __init__(self, id=None): super(Node, self).__init__(id) + self.lock = None self.provider = None def __repr__(self): @@ -457,6 +458,9 @@ class ZooKeeper(object): def _nodePath(self, node): return "%s/%s" % (self.NODE_ROOT, node) + def _nodeLockPath(self, node): + return "%s/%s/lock" % (self.NODE_ROOT, node) + def _requestPath(self, request): return "%s/%s" % (self.REQUEST_ROOT, request) @@ -1239,6 +1243,53 @@ class ZooKeeper(object): request.lock.release() request.lock = None + def lockNode(self, node, 
blocking=True, timeout=None): + ''' + Lock a node. + + This will set the `lock` attribute of the Node object when the + lock is successfully acquired. + + :param Node node: The node to lock. + :param bool blocking: Whether or not to block on trying to + acquire the lock + :param int timeout: When blocking, how long to wait for the lock + to get acquired. None, the default, waits forever. + + :raises: TimeoutException if we failed to acquire the lock when + blocking with a timeout. ZKLockException if we are not blocking + and could not get the lock, or a lock is already held. + ''' + path = self._nodeLockPath(node.id) + try: + lock = Lock(self.client, path) + have_lock = lock.acquire(blocking, timeout) + except kze.LockTimeout: + raise npe.TimeoutException( + "Timeout trying to acquire lock %s" % path) + + # If we aren't blocking, it's possible we didn't get the lock + # because someone else has it. + if not have_lock: + raise npe.ZKLockException("Did not get lock on %s" % path) + + node.lock = lock + + def unlockNode(self, node): + ''' + Unlock a node. + + The node must already have been locked. + + :param Node node: The node to unlock. + + :raises: ZKLockException if the node is not currently locked. + ''' + if node.lock is None: + raise npe.ZKLockException("Node %s does not hold a lock" % node) + node.lock.release() + node.lock = None + def getNodes(self): ''' Get the current list of all nodes. From 5ee68af0014f5abfadcfefc8a6e8057acf365347 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 23 Jan 2017 14:28:47 -0500 Subject: [PATCH 022/309] Add nodepoold test for request decline and fail Add a test that validates that the node request is declined and then set to failed if it cannot be satisfied. We use an invalid image name in the test to get to the proper code path. Change-Id: I860d16740ac3cac06ae5aedafc7409159d50a566 --- nodepool/nodepool.py | 45 ++++++++++++++++++--------------- nodepool/tests/__init__.py | 22 ++++++++++++++++ nodepool/tests/test_nodepool.py | 15 +++++++++++ 3 files changed, 62 insertions(+), 20 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index e441c8b6a..2994511ef 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -800,7 +800,8 @@ class ProviderWorker(threading.Thread): that will be recognized and this thread will shut itself down. 
''' - def __init__(self, configfile, zk, provider): + def __init__(self, configfile, zk, provider, + watermark_sleep=WATERMARK_SLEEP): threading.Thread.__init__( self, name='ProviderWorker.%s' % provider.name ) @@ -810,6 +811,7 @@ class ProviderWorker(threading.Thread): self.running = False self.configfile = configfile self.workers = [] + self.watermark_sleep = watermark_sleep self.launcher_id = "%s-%s-%s" % (socket.gethostname(), os.getpid(), self.ident) @@ -913,7 +915,7 @@ class ProviderWorker(threading.Thread): if self.provider.max_concurrency != 0: self._processRequests() - time.sleep(10) + time.sleep(self.watermark_sleep) self._updateProvider() def stop(self): @@ -936,6 +938,7 @@ class NodePool(threading.Thread): self.apsched = None self.zk = None self.statsd = stats.get_client() + self._provider_threads = {} self._delete_threads = {} self._delete_threads_lock = threading.Lock() self._instance_delete_threads = {} @@ -951,7 +954,17 @@ class NodePool(threading.Thread): provider_manager.ProviderManager.stopProviders(self.config) if self.apsched and self.apsched.running: self.apsched.shutdown() - self.log.debug("finished stopping") + + # Don't let stop() return until all provider threads have been + # terminated. + self.log.debug("Stopping provider threads") + for thd in self._provider_threads.values(): + if thd.isAlive(): + thd.stop() + self.log.debug("Waiting for %s" % thd.name) + thd.join() + + self.log.debug("Finished stopping") def loadConfig(self): self.log.debug("Loading configuration") @@ -1243,9 +1256,6 @@ class NodePool(threading.Thread): ''' Start point for the NodePool thread. ''' - # Provider threads keyed by provider name - provider_threads = {} - while not self._stopped: try: self.updateConfig() @@ -1259,17 +1269,19 @@ class NodePool(threading.Thread): # the config. Removing a provider from the config and then # adding it back would cause a restart. 
for p in self.config.providers.values(): - if p.name not in provider_threads.keys(): - t = ProviderWorker(self.configfile, self.zk, p) + if p.name not in self._provider_threads.keys(): + t = ProviderWorker(self.configfile, self.zk, p, + self.watermark_sleep) self.log.info( "Starting %s" % t.name) t.start() - provider_threads[p.name] = t - elif not provider_threads[p.name].isAlive(): - provider_threads[p.name].join() - t = ProviderWorker(self.configfile, self.zk, p) + self._provider_threads[p.name] = t + elif not self._provider_threads[p.name].isAlive(): + self._provider_threads[p.name].join() + t = ProviderWorker(self.configfile, self.zk, p, + self.watermark_sleep) self.log.info( "Restarting %s" % t.name) t.start() - provider_threads[p.name] = t + self._provider_threads[p.name] = t except Exception: self.log.exception("Exception in main loop:") @@ -1277,13 +1289,6 @@ class NodePool(threading.Thread): self._wake_condition.wait(self.watermark_sleep) self._wake_condition.release() - # Stop provider threads - for thd in provider_threads.values(): - if thd.isAlive(): - thd.stop() - self.log.info("Waiting for %s" % thd.name) - thd.join() - def _run(self, session, allocation_history): # Make up the subnode deficit first to make sure that an # already allocated node has priority in filling its subnodes diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index f93cd5d98..4ffe84a34 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -430,6 +430,28 @@ class DBTestCase(BaseTestCase): time.sleep(1) self.wait_for_threads() + def submitNodeRequest(self, req): + ''' + Very simple submit of a node request to ZooKeeper. + ''' + priority = 100 + req.state = zk.REQUESTED + path = '%s/%s-' % (self.zk.REQUEST_ROOT, priority) + path = self.zk.client.create(path, req.serialize(), makepath=True, + sequence=True, ephemeral=True) + req.id = path.split("/")[-1] + + def waitForNodeRequest(self, req): + ''' + Wait for a node request to transition out of REQUESTED state. + ''' + while True: + req = self.zk.getNodeRequest(req.id) + if req.state != zk.REQUESTED: + break + time.sleep(1) + return req + def useNodepool(self, *args, **kwargs): args = (self.secure_conf,) + args pool = nodepool.NodePool(*args, **kwargs) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 40558709e..c2ea2cdc4 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -22,6 +22,7 @@ import fixtures from nodepool import tests from nodepool import nodedb +from nodepool import zk import nodepool.fakeprovider import nodepool.nodepool @@ -29,6 +30,20 @@ import nodepool.nodepool class TestNodepool(tests.DBTestCase): log = logging.getLogger("nodepool.TestNodepool") + def test_decline_and_fail(self): + configfile = self.setup_config('node.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + pool.start() + + req = zk.NodeRequest() + req.node_types.append("zorky-zumba") + self.submitNodeRequest(req) + self.assertEqual(req.state, zk.REQUESTED) + + req = self.waitForNodeRequest(req) + self.assertEqual(req.state, zk.FAILED) + self.assertNotEqual(req.declined_by, []) + @skip("Disabled for early v3 development") def test_node(self): """Test that an image and node are created""" From 7274778c113ae4037d03e2cece5fb94d529998bb Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 18 Jan 2017 14:16:58 -0800 Subject: [PATCH 023/309] Add files for zuul-nodepool integration test This adds config files and a startup script. 
Remove the nodepool.yaml .gitignore. Change-Id: Icc58521f520f719f24f59132c424b3a71432285f --- .gitignore | 1 - .../builder-logging.conf | 48 +++++++++++++++++++ .../launcher-logging.conf | 48 +++++++++++++++++++ tools/zuul-nodepool-integration/nodepool.yaml | 32 +++++++++++++ tools/zuul-nodepool-integration/secure.conf | 2 + tools/zuul-nodepool-integration/start.sh | 9 ++++ 6 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 tools/zuul-nodepool-integration/builder-logging.conf create mode 100644 tools/zuul-nodepool-integration/launcher-logging.conf create mode 100644 tools/zuul-nodepool-integration/nodepool.yaml create mode 100644 tools/zuul-nodepool-integration/secure.conf create mode 100755 tools/zuul-nodepool-integration/start.sh diff --git a/.gitignore b/.gitignore index 9cce615c2..26e93f51e 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,5 @@ doc/build/* zuul/versioninfo dist/ venv/ -nodepool.yaml *~ .*.swp diff --git a/tools/zuul-nodepool-integration/builder-logging.conf b/tools/zuul-nodepool-integration/builder-logging.conf new file mode 100644 index 000000000..1ac791f19 --- /dev/null +++ b/tools/zuul-nodepool-integration/builder-logging.conf @@ -0,0 +1,48 @@ +[loggers] +keys=root,nodepool,requests,shade + +[handlers] +keys=console,normal + +[formatters] +keys=simple + +[logger_root] +level=WARNING +handlers=console + +[logger_requests] +level=WARNING +handlers=normal +qualname=requests + +[logger_shade] +level=WARNING +handlers=normal +qualname=shade + +[logger_gear] +level=DEBUG +handlers=normal +qualname=gear + +[logger_nodepool] +level=DEBUG +handlers=normal +qualname=nodepool + +[handler_console] +level=WARNING +class=StreamHandler +formatter=simple +args=(sys.stdout,) + +[handler_normal] +level=DEBUG +class=FileHandler +formatter=simple +args=('/tmp/nodepool/log/nodepool-builder.log',) + +[formatter_simple] +format=%(asctime)s %(levelname)s %(name)s: %(message)s +datefmt= diff --git a/tools/zuul-nodepool-integration/launcher-logging.conf b/tools/zuul-nodepool-integration/launcher-logging.conf new file mode 100644 index 000000000..e206606de --- /dev/null +++ b/tools/zuul-nodepool-integration/launcher-logging.conf @@ -0,0 +1,48 @@ +[loggers] +keys=root,nodepool,requests,shade + +[handlers] +keys=console,normal + +[formatters] +keys=simple + +[logger_root] +level=WARNING +handlers=console + +[logger_requests] +level=WARNING +handlers=normal +qualname=requests + +[logger_shade] +level=WARNING +handlers=normal +qualname=shade + +[logger_gear] +level=DEBUG +handlers=normal +qualname=gear + +[logger_nodepool] +level=DEBUG +handlers=normal +qualname=nodepool + +[handler_console] +level=WARNING +class=StreamHandler +formatter=simple +args=(sys.stdout,) + +[handler_normal] +level=DEBUG +class=FileHandler +formatter=simple +args=('/tmp/nodepool/log/nodepool-launcher.log',) + +[formatter_simple] +format=%(asctime)s %(levelname)s %(name)s: %(message)s +datefmt= diff --git a/tools/zuul-nodepool-integration/nodepool.yaml b/tools/zuul-nodepool-integration/nodepool.yaml new file mode 100644 index 000000000..7e34f3b6b --- /dev/null +++ b/tools/zuul-nodepool-integration/nodepool.yaml @@ -0,0 +1,32 @@ +images-dir: /tmp/nodepool/images + +zookeeper-servers: + - host: localhost + +diskimages: + - name: fake-nodepool + elements: + - fedora + - vm + release: 21 + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: 
Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 + +labels: + - name: fake-label + image: fake-nodepool + min-ready: 2 + providers: + - name: fake-provider + +providers: + - name: fake-provider + max-servers: 96 + auth-url: 'fake' + images: + - name: fake-nodepool + min-ram: 8192 + diskimage: fake-nodepool diff --git a/tools/zuul-nodepool-integration/secure.conf b/tools/zuul-nodepool-integration/secure.conf new file mode 100644 index 000000000..d29d9c094 --- /dev/null +++ b/tools/zuul-nodepool-integration/secure.conf @@ -0,0 +1,2 @@ +[database] +dburi=mysql+pymysql://nodepool@localhost/nodepool diff --git a/tools/zuul-nodepool-integration/start.sh b/tools/zuul-nodepool-integration/start.sh new file mode 100755 index 000000000..c5db64e6c --- /dev/null +++ b/tools/zuul-nodepool-integration/start.sh @@ -0,0 +1,9 @@ +#!/bin/bash -e + +cd "$(dirname "$0")" + +mkdir -p /tmp/nodepool/images +mkdir -p /tmp/nodepool/log + +nodepool-builder -c `pwd`/nodepool.yaml -l `pwd`/builder-logging.conf -p /tmp/nodepool/builder.pid --fake +nodepoold -c `pwd`/nodepool.yaml -s `pwd`/secure.conf -l `pwd`/launcher-logging.conf -p /tmp/nodepool/launcher.pid From 578822276d9926fa689980d557cd9a4cedd34c0c Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 24 Jan 2017 14:21:17 -0500 Subject: [PATCH 024/309] Manage providers in ProviderWorker threads We don't want every provider manager to be available in the config file. Instead, let each thread responsible for the provider have a single ProviderManager that it manages itself. Since this manager is among several attributes that need to be shared with the child NodeRequestWorker threads, let's just pass the parent down to the children and let them grab the attributes needed. Shade integration tests were changed to not require access to nodepool internals and test the actual thing they care about, the ProviderManager. Change-Id: I1533d53ff2cdf7ca51b7e8cc96ba55a3ced1a96c --- nodepool/nodepool.py | 46 +++++++++++++++--------- nodepool/tests/test_shade_integration.py | 27 +++++++++----- 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 2994511ef..144d10a7b 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -680,20 +680,19 @@ class NodeRequestWorker(threading.Thread): to this thread for it to process. ''' - def __init__(self, zk, launcher_id, provider, request): + def __init__(self, pw, request): ''' - :param ZooKeeper zk: Connected ZooKeeper object. - :param str launcher_id: ID of the launcher handling the request. - :param Provider provider: Provider object from the config file. + :param ProviderWorker pw: The parent ProviderWorker object. :param NodeRequest request: The request to handle. 
''' threading.Thread.__init__( self, name='NodeRequestWorker.%s' % request.id ) self.log = logging.getLogger("nodepool.%s" % self.name) - self.zk = zk - self.launcher_id = launcher_id - self.provider = provider + self.provider = pw.provider + self.zk = pw.zk + self.manager = pw.manager + self.launcher_id = pw.launcher_id self.request = request def _imagesAvailable(self): @@ -806,12 +805,15 @@ class ProviderWorker(threading.Thread): self, name='ProviderWorker.%s' % provider.name ) self.log = logging.getLogger("nodepool.%s" % self.name) - self.provider = provider - self.zk = zk self.running = False self.configfile = configfile self.workers = [] self.watermark_sleep = watermark_sleep + + # These attributes will be used by NodeRequestWorker children + self.zk = zk + self.manager = None + self.provider = provider self.launcher_id = "%s-%s-%s" % (socket.gethostname(), os.getpid(), self.ident) @@ -834,11 +836,17 @@ class ProviderWorker(threading.Thread): self.log.info("Provider %s removed from config" % self.provider.name) self.stop() - - # TODO(Shrews): Should we remove any existing nodes from the - # provider here? - else: + elif self.provider != config.providers[self.provider.name]: self.provider = config.providers[self.provider.name] + if self.manager: + self.manager.stop() + self.manager = None + + if not self.manager: + self.log.debug("Creating new ProviderManager") + self.manager = provider_manager.ProviderManager( + self.provider, use_taskmanager=True) + self.manager.start() def _processRequests(self): self.log.debug("Getting node request from ZK queue") @@ -874,8 +882,7 @@ class ProviderWorker(threading.Thread): # Got a lock, so assign it self.log.info("Assigning node request %s" % req.id) - t = NodeRequestWorker(self.zk, self.launcher_id, - self.provider, req) + t = NodeRequestWorker(self, req) t.start() self.workers.append(t) @@ -909,6 +916,10 @@ class ProviderWorker(threading.Thread): # Make sure we're always registered with ZK self.zk.registerLauncher(self.launcher_id) + self._updateProvider() + if not self.running: + break + if self.provider.max_concurrency == -1 and self.workers: self.workers = [] @@ -916,11 +927,13 @@ class ProviderWorker(threading.Thread): self._processRequests() time.sleep(self.watermark_sleep) - self._updateProvider() def stop(self): self.log.info("%s received stop" % self.name) self.running = False + if self.manager: + self.manager.stop() + self.manager.join() class NodePool(threading.Thread): @@ -1248,7 +1261,6 @@ class NodePool(threading.Thread): def updateConfig(self): config = self.loadConfig() self.reconfigureZooKeeper(config) - self.reconfigureManagers(config) self.reconfigureCrons(config) self.setConfig(config) diff --git a/nodepool/tests/test_shade_integration.py b/nodepool/tests/test_shade_integration.py index b1430d20a..6da6a1b27 100644 --- a/nodepool/tests/test_shade_integration.py +++ b/nodepool/tests/test_shade_integration.py @@ -20,6 +20,10 @@ import shade import testtools import yaml +from unittest import skip + +from nodepool import config as nodepool_config +from nodepool import provider_manager from nodepool import tests from nodepool.provider_manager import shade_inner_exceptions @@ -43,15 +47,17 @@ class TestShadeIntegration(tests.IntegrationTestCase): def test_nodepool_provider_config(self): configfile = self.setup_config('integration.yaml') - pool = self.useNodepool(configfile, watermark_sleep=1) - pool.updateConfig() - provider_manager = pool.config.provider_managers['real-provider'] + config = nodepool_config.loadConfig(configfile) + 
self.assertIn('real-provider', config.providers) + pm = provider_manager.ProviderManager( + config.providers['real-provider'], use_taskmanager=False) + pm.start() auth_data = {'username': 'real', 'project_id': 'real', 'password': 'real', 'auth_url': 'real'} - self.assertEqual(provider_manager._client.auth, auth_data) - self.assertEqual(provider_manager._client.region_name, 'real-region') + self.assertEqual(pm._client.auth, auth_data) + self.assertEqual(pm._client.region_name, 'real-region') def test_nodepool_osc_config(self): configfile = self.setup_config('integration_osc.yaml') @@ -62,11 +68,14 @@ class TestShadeIntegration(tests.IntegrationTestCase): osc_config = {'clouds': {'real-cloud': {'auth': auth_data}}} self._use_cloud_config(osc_config) - pool = self.useNodepool(configfile, watermark_sleep=1) - pool.updateConfig() - provider_manager = pool.config.provider_managers['real-provider'] - self.assertEqual(provider_manager._client.auth, auth_data) + config = nodepool_config.loadConfig(configfile) + self.assertIn('real-provider', config.providers) + pm = provider_manager.ProviderManager( + config.providers['real-provider'], use_taskmanager=False) + pm.start() + self.assertEqual(pm._client.auth, auth_data) + @skip("Disabled for early v3 development") def test_nodepool_osc_config_reload(self): configfile = self.setup_config('integration_osc.yaml') auth_data = {'username': 'os_real', From 851b0336608513034765e77707f2ad1a1ff82fa4 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 31 Jan 2017 13:42:21 -0500 Subject: [PATCH 025/309] Add storeNode ZK API method Besides the new method, also: - allow FAILED state for Node objects - add __eq__ method to Node class - add allocated_to attribute to Node class Change-Id: I14642add3575a86fd18f1fea7e412da38674baf0 --- nodepool/tests/test_zk.py | 23 ++++++++++++++++++++-- nodepool/zk.py | 40 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index e1a24201b..7d3526ee6 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -498,6 +498,21 @@ class TestZooKeeper(tests.DBTestCase): with testtools.ExpectedException(npe.ZKLockException): self.zk.unlockNode(node) + def test_storeNode(self): + node = zk.Node() + node.state = zk.BUILDING + node.provider = 'rax' + + self.assertIsNone(node.id) + self.zk.storeNode(node) + self.assertIsNotNone(node.id) + self.assertIsNotNone( + self.zk.client.exists(self.zk._nodePath(node.id)) + ) + + node2 = self.zk.getNode(node.id) + self.assertEqual(node, node2) + class TestZKModel(tests.BaseTestCase): @@ -619,11 +634,13 @@ class TestZKModel(tests.BaseTestCase): def test_Node_toDict(self): o = zk.Node('123') o.provider = 'rax' + o.allocated_to = '456-789' d = o.toDict() self.assertNotIn('id', d) self.assertIn('state', d) self.assertIn('state_time', d) - self.assertEqual(d['provider'], 'rax') + self.assertEqual(d['provider'], o.provider) + self.assertEqual(d['allocated_to'], o.allocated_to) def test_Node_fromDict(self): now = int(time.time()) @@ -632,10 +649,12 @@ class TestZKModel(tests.BaseTestCase): 'state': zk.READY, 'state_time': now, 'provider': 'rax', + 'allocated_to': '456-789', } o = zk.Node.fromDict(d, node_id) self.assertEqual(o.id, node_id) self.assertEqual(o.state, d['state']) self.assertEqual(o.state_time, d['state_time']) - self.assertEqual(o.provider, 'rax') + self.assertEqual(o.provider, d['provider']) + self.assertEqual(o.allocated_to, d['allocated_to']) diff --git a/nodepool/zk.py 
b/nodepool/zk.py index 1237a7bf2..22ef518e9 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -125,6 +125,8 @@ class BaseModel(object): def __init__(self, o_id): if o_id: self.id = o_id + else: + self._id = None self._state = None self.state_time = None self.stat = None @@ -351,12 +353,13 @@ class Node(BaseModel): Class representing a launched node. ''' VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED, - HOLD, DELETING]) + HOLD, DELETING, FAILED]) def __init__(self, id=None): super(Node, self).__init__(id) self.lock = None self.provider = None + self.allocated_to = None def __repr__(self): d = self.toDict() @@ -364,12 +367,23 @@ class Node(BaseModel): d['stat'] = self.stat return '' % d + def __eq__(self, other): + if isinstance(other, Node): + return (self.id == other.id and + self.state == other.state and + self.state_time == other.state_time and + self.provider == other.provider and + self.allocated_to == other.allocated_to) + else: + return False + def toDict(self): ''' Convert a Node object's attributes to a dictionary. ''' d = super(Node, self).toDict() d['provider'] = self.provider + d['allocated_to'] = self.allocated_to return d @staticmethod @@ -385,6 +399,7 @@ class Node(BaseModel): o = Node(o_id) super(Node, o).fromDict(d) o.provider = d.get('provider') + o.allocated_to = d.get('allocated_to') return o @@ -1316,5 +1331,28 @@ class ZooKeeper(object): return None d = Node.fromDict(self._strToDict(data), node) + d.id = node d.stat = stat return d + + def storeNode(self, node): + ''' + Store an new or existing node. + + If this is a new node, then node.id will be set with the newly created + node identifier. Otherwise, node.id is used to identify the node to + update. + + :param Node node: The Node object to store. + ''' + if not node.id: + node_path = "%s/" % self.NODE_ROOT + path = self.client.create( + node_path, + value=node.serialize(), + sequence=True, + makepath=True) + node.id = path.split("/")[-1] + else: + path = self._nodePath(node) + self.client.set(path, node.serialize()) From cc3cd6747c94211629e3c9ac5df7b8468e003b60 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 31 Jan 2017 15:22:06 -0500 Subject: [PATCH 026/309] Replace NodeRequestWorker with NodeRequestHandler Eliminate the NodeRequestWorker thread and replace it with an object that will be able to handle polling for doneness. This helps to reduce thread contention since we will eventually be creating even MORE threads (within NodeRequestHandler) to actually launch nodes. Change-Id: I825ac3ec62d9ad797053140f167a0b04da58287f --- nodepool/nodepool.py | 71 ++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 144d10a7b..1f6edc577 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -671,7 +671,7 @@ class SubNodeLauncher(threading.Thread): return dt -class NodeRequestWorker(threading.Thread): +class NodeRequestHandler(object): ''' Class to process a single node request. @@ -685,15 +685,13 @@ class NodeRequestWorker(threading.Thread): :param ProviderWorker pw: The parent ProviderWorker object. :param NodeRequest request: The request to handle. 
''' - threading.Thread.__init__( - self, name='NodeRequestWorker.%s' % request.id - ) - self.log = logging.getLogger("nodepool.%s" % self.name) + self.log = logging.getLogger("nodepool.NodeRequestHandler") self.provider = pw.provider self.zk = pw.zk self.manager = pw.manager self.launcher_id = pw.launcher_id self.request = request + self.done = False def _imagesAvailable(self): ''' @@ -738,14 +736,14 @@ class NodeRequestWorker(threading.Thread): try: self._run() except Exception: - self.log.exception("Exception in NodeRequestWorker:") + self.log.exception("Exception in NodeRequestHandler:") self.request.state = zk.FAILED self.zk.updateNodeRequest(self.request) self.zk.unlockNodeRequest(self.request) def _run(self): ''' - Main body for the NodeRequestWorker. + Main body for the NodeRequestHandler. note:: This code is a bit racey in its calculation of the number of nodes in use for quota purposes. It is possible for multiple @@ -774,6 +772,7 @@ class NodeRequestWorker(threading.Thread): self.request.state = zk.FAILED self.zk.updateNodeRequest(self.request) self.zk.unlockNodeRequest(self.request) + self.done = True return # TODO(Shrews): Determine node availability and if we need to launch @@ -807,10 +806,10 @@ class ProviderWorker(threading.Thread): self.log = logging.getLogger("nodepool.%s" % self.name) self.running = False self.configfile = configfile - self.workers = [] + self.request_handlers = [] self.watermark_sleep = watermark_sleep - # These attributes will be used by NodeRequestWorker children + # These attributes will be used by NodeRequestHandler self.zk = zk self.manager = None self.provider = provider @@ -848,13 +847,28 @@ class ProviderWorker(threading.Thread): self.provider, use_taskmanager=True) self.manager.start() - def _processRequests(self): - self.log.debug("Getting node request from ZK queue") + def _activeThreads(self): + total = 0 + # TODO(Shrews): return a count of active threads + #for r in self.request_handlers: + # total += r.alive_thread_count + return total + + def _assignHandlers(self): + ''' + For each request we can grab, create a NodeRequestHandler for it. + + The NodeRequestHandler object will kick off any threads needed to + satisfy the request, then return. We will need to periodically poll + the handler for completion. + ''' + if self.provider.max_concurrency == 0: + return for req_id in self.zk.getNodeRequests(): # Short-circuit for limited request handling if (self.provider.max_concurrency > 0 - and self._activeWorkers() >= self.provider.max_concurrency + and self._activeThreads() >= self.provider.max_concurrency ): return @@ -882,23 +896,20 @@ class ProviderWorker(threading.Thread): # Got a lock, so assign it self.log.info("Assigning node request %s" % req.id) - t = NodeRequestWorker(self, req) - t.start() - self.workers.append(t) + rh = NodeRequestHandler(self, req) + rh.run() + self.request_handlers.append(rh) - def _activeWorkers(self): + def _removeCompletedHandlers(self): ''' - Return a count of the number of requests actively being handled. - - This serves the dual-purpose of also removing completed requests from - our list of tracked threads. + Poll handlers to see which have completed. 
''' - active = [] - for w in self.workers: - if w.isAlive(): - active.append(w) - self.workers = active - return len(self.workers) + active_handlers = [] + # TODO(Shrews): implement handler polling + #for r in self.request_handlers: + # if not r.poll(): + # active_handlers.append(r) + self.request_handlers = active_handlers #---------------------------------------------------------------- # Public methods @@ -920,12 +931,8 @@ class ProviderWorker(threading.Thread): if not self.running: break - if self.provider.max_concurrency == -1 and self.workers: - self.workers = [] - - if self.provider.max_concurrency != 0: - self._processRequests() - + self._assignHandlers() + self._removeCompletedHandlers() time.sleep(self.watermark_sleep) def stop(self): From b10259651769172e85a0bde89b8cdb10932a231b Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Wed, 1 Feb 2017 14:49:50 -0500 Subject: [PATCH 027/309] Fix documentation nits from earlier reviews. Clarify the behavior around setting the 'id' attribute in the ZK model base class, and remove an invalid reference to 'thread' by removing a redundant portion of a docstring sentence. Change-Id: Iec175688898c39478af16ddba7a275e02c451650 --- nodepool/nodepool.py | 3 +-- nodepool/zk.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 1f6edc577..dd1517c15 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -676,8 +676,7 @@ class NodeRequestHandler(object): Class to process a single node request. The ProviderWorker thread will instantiate a class of this type for each - node request that it pulls from ZooKeeper. That request will be assigned - to this thread for it to process. + node request that it pulls from ZooKeeper. ''' def __init__(self, pw, request): diff --git a/nodepool/zk.py b/nodepool/zk.py index 22ef518e9..5334d8215 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -124,8 +124,10 @@ class BaseModel(object): def __init__(self, o_id): if o_id: + # Call the setter for id so we can validate the incoming type. self.id = o_id else: + # Bypass the setter for id to set the default. self._id = None self._state = None self.state_time = None From ed260effbabbcca88e05180b9a142a0fc3a373bd Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 2 Feb 2017 16:06:00 -0500 Subject: [PATCH 028/309] Fix storeNode() for updates The node ID needed to be passed for computing path. 
Change-Id: I27c740195bac439eb3626a51d47e0ff8cfe489ea --- nodepool/tests/test_zk.py | 12 +++++++++++- nodepool/zk.py | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 7d3526ee6..31c582e97 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -498,7 +498,7 @@ class TestZooKeeper(tests.DBTestCase): with testtools.ExpectedException(npe.ZKLockException): self.zk.unlockNode(node) - def test_storeNode(self): + def _create_node(self): node = zk.Node() node.state = zk.BUILDING node.provider = 'rax' @@ -509,7 +509,17 @@ class TestZooKeeper(tests.DBTestCase): self.assertIsNotNone( self.zk.client.exists(self.zk._nodePath(node.id)) ) + return node + def test_storeNode(self): + node = self._create_node() + node2 = self.zk.getNode(node.id) + self.assertEqual(node, node2) + + def test_storeNode_update(self): + node = self._create_node() + node.state = zk.READY + self.zk.storeNode(node) node2 = self.zk.getNode(node.id) self.assertEqual(node, node2) diff --git a/nodepool/zk.py b/nodepool/zk.py index 5334d8215..6ba1c6ffb 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -1356,5 +1356,5 @@ class ZooKeeper(object): makepath=True) node.id = path.split("/")[-1] else: - path = self._nodePath(node) + path = self._nodePath(node.id) self.client.set(path, node.serialize()) From 42a19452869fad15fcbedd717cbaeb903bcb8af3 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 3 Feb 2017 09:02:39 -0500 Subject: [PATCH 029/309] Fix race in node request state changes Tests are currently passing because we quickly change a request state from PENDING to FULFILLED or FAILED. We need to also ignore the PENDING state when waiting for the request to reach final state. Do that by checking for final states in waitForNodeRequest(). Change-Id: Ia720fd8f15baf99e1a96d1aaac484948197559ba --- nodepool/tests/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 4ffe84a34..89047ed17 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -443,11 +443,11 @@ class DBTestCase(BaseTestCase): def waitForNodeRequest(self, req): ''' - Wait for a node request to transition out of REQUESTED state. + Wait for a node request to transition to a final state. ''' while True: req = self.zk.getNodeRequest(req.id) - if req.state != zk.REQUESTED: + if req.state in (zk.FULFILLED, zk.FAILED): break time.sleep(1) return req From a293b7c066cbeb7750ee8da0fde8d1563dc8d210 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 3 Feb 2017 08:48:29 -0500 Subject: [PATCH 030/309] Update Node and NodeRequest models Adds 'nodes' to NodeRequest and 'type' to Node. 
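Roughly, the new attributes let a node advertise which label it provides and let a fulfilled request record the node IDs that satisfied it. A small sketch in the same style as the test changes in this patch, assuming a tree with this change applied (the IDs and values are made up):

    from nodepool import zk

    req = zk.NodeRequest('100-0000000001')
    req.node_types.append('trusty')    # what was requested
    req.nodes.append('0000000100')     # node IDs satisfying the request (new)

    node = zk.Node('0000000100')
    node.state = zk.READY
    node.provider = 'rax'
    node.type = 'trusty'               # label this node provides (new)
    node.allocated_to = req.id

    assert req.toDict()['nodes'] == ['0000000100']
    assert node.toDict()['type'] == 'trusty'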
Change-Id: I0e2dee371d4caf21e619f7f5dca1ce140bd7c84b --- nodepool/tests/test_zk.py | 13 +++++++++++-- nodepool/zk.py | 7 +++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 31c582e97..bbdfc7eef 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -618,12 +618,14 @@ class TestZKModel(tests.BaseTestCase): o = zk.NodeRequest("500-123") o.declined_by.append("abc") o.node_types.append('trusty') + o.nodes.append('100') d = o.toDict() self.assertNotIn('id', d) self.assertIn('state', d) self.assertIn('state_time', d) - self.assertEqual(d['declined_by'], ['abc']) - self.assertEqual(d['node_types'], ['trusty']) + self.assertEqual(d['declined_by'], o.declined_by) + self.assertEqual(d['node_types'], o.node_types) + self.assertEqual(d['nodes'], o.nodes) def test_NodeRequest_fromDict(self): now = int(time.time()) @@ -633,6 +635,7 @@ class TestZKModel(tests.BaseTestCase): 'state_time': now, 'declined_by': ['abc'], 'node_types': ['trusty'], + 'nodes': ['100'], } o = zk.NodeRequest.fromDict(d, req_id) @@ -640,16 +643,20 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(o.state, d['state']) self.assertEqual(o.state_time, d['state_time']) self.assertEqual(o.declined_by, d['declined_by']) + self.assertEqual(o.node_types, d['node_types']) + self.assertEqual(o.nodes, d['nodes']) def test_Node_toDict(self): o = zk.Node('123') o.provider = 'rax' + o.type = 'trusty' o.allocated_to = '456-789' d = o.toDict() self.assertNotIn('id', d) self.assertIn('state', d) self.assertIn('state_time', d) self.assertEqual(d['provider'], o.provider) + self.assertEqual(d['type'], o.type) self.assertEqual(d['allocated_to'], o.allocated_to) def test_Node_fromDict(self): @@ -659,6 +666,7 @@ class TestZKModel(tests.BaseTestCase): 'state': zk.READY, 'state_time': now, 'provider': 'rax', + 'type': 'trusty', 'allocated_to': '456-789', } @@ -667,4 +675,5 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(o.state, d['state']) self.assertEqual(o.state_time, d['state_time']) self.assertEqual(o.provider, d['provider']) + self.assertEqual(o.type, d['type']) self.assertEqual(o.allocated_to, d['allocated_to']) diff --git a/nodepool/zk.py b/nodepool/zk.py index 6ba1c6ffb..ba19527bc 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -317,6 +317,7 @@ class NodeRequest(BaseModel): self.lock = None self.declined_by = [] self.node_types = [] + self.nodes = [] def __repr__(self): d = self.toDict() @@ -331,6 +332,7 @@ class NodeRequest(BaseModel): d = super(NodeRequest, self).toDict() d['declined_by'] = self.declined_by d['node_types'] = self.node_types + d['nodes'] = self.nodes return d @staticmethod @@ -347,6 +349,7 @@ class NodeRequest(BaseModel): super(NodeRequest, o).fromDict(d) o.declined_by = d.get('declined_by', []) o.node_types = d.get('node_types', []) + o.nodes = d.get('nodes', []) return o @@ -361,6 +364,7 @@ class Node(BaseModel): super(Node, self).__init__(id) self.lock = None self.provider = None + self.type = None self.allocated_to = None def __repr__(self): @@ -375,6 +379,7 @@ class Node(BaseModel): self.state == other.state and self.state_time == other.state_time and self.provider == other.provider and + self.type == other.type and self.allocated_to == other.allocated_to) else: return False @@ -385,6 +390,7 @@ class Node(BaseModel): ''' d = super(Node, self).toDict() d['provider'] = self.provider + d['type'] = self.type d['allocated_to'] = self.allocated_to return d @@ -401,6 +407,7 @@ class Node(BaseModel): o = 
Node(o_id) super(Node, o).fromDict(d) o.provider = d.get('provider') + o.type = d.get('type') o.allocated_to = d.get('allocated_to') return o From e27d786b38007f6eff98482e3792833b37b9ee4d Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 31 Jan 2017 13:12:21 -0500 Subject: [PATCH 031/309] Assign node set to node requests This adds the concept of a NodeLaunchManager that will spawn threads for all new nodes that need launched. It can be polled to see if the launches have completed. NOTE: No nodes are actually launched yet by the new, empty NodeLauncher thread class. Test changes: - add test that nodes are assigned correctly to a request - add test that node request is updated correctly when launching fails - rename test_decline_and_fail to test_invalid_image_fails Change-Id: Ib7b2c9298d5c903610276bf6dfde9eb483d8dac3 --- nodepool/nodepool.py | 275 ++++++++++++++++++++--- nodepool/tests/test_nodelaunchmanager.py | 63 ++++++ nodepool/tests/test_nodepool.py | 59 ++++- nodepool/zk.py | 4 +- 4 files changed, 367 insertions(+), 34 deletions(-) create mode 100644 nodepool/tests/test_nodelaunchmanager.py diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index dd1517c15..517bfe06b 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -208,7 +208,7 @@ class NodeDeleter(threading.Thread): self.node_id) -class NodeLauncher(threading.Thread): +class OLDNodeLauncher(threading.Thread): log = logging.getLogger("nodepool.NodeLauncher") def __init__(self, nodepool, provider, label, target, node_id, timeout, @@ -671,6 +671,103 @@ class SubNodeLauncher(threading.Thread): return dt +class NodeLauncher(threading.Thread): + def __init__(self, zk, node, retries): + threading.Thread.__init__(self) + self._zk = zk + self._node = node + self._retries = retries + + def _launchNode(self): + # TODO(Shrews): Use self._retries here + pass + + def run(self): + try: + self._run() + except Exception: + self._node.state = zk.FAILED + self._zk.storeNode(self._node) + + def _run(self): + self._launchNode() + self._node.state = zk.READY + self._zk.storeNode(self._node) + + +class NodeLaunchManager(object): + ''' + Handle launching multiple nodes in parallel. + ''' + def __init__(self, zk, retries): + self._zk = zk + self._retries = retries + self._nodes = [] + self._failed_nodes = [] + self._ready_nodes = [] + self._threads = [] + + @property + def alive_thread_count(self): + count = 0 + for t in self._threads: + if t.isAlive(): + count += 1 + return count + + @property + def failed_nodes(self): + return self._failed_nodes + + @property + def ready_nodes(self): + return self._ready_nodes + + def launch(self, node): + ''' + Launch a new node as described by the supplied Node. + + We expect each NodeLauncher thread to directly modify the node that + is passed to it. The poll() method will expect to see the node.state + attribute to change as the node is processed. + + :param Node node: The node object. + ''' + self._nodes.append(node) + t = NodeLauncher(self._zk, node, self._retries) + t.start() + self._threads.append(t) + + def poll(self): + ''' + Check if all launch requests have completed. + + When all of the Node objects have reached a final state (READY or + FAILED), we'll know all threads have finished the launch process. + ''' + if not self._threads: + return True + + # Give the NodeLaunch threads time to finish. 
+ if self.alive_thread_count: + return False + + node_states = [node.state for node in self._nodes] + + # NOTE: It very important that NodeLauncher always sets one of + # these states, no matter what. + if not all(s in (zk.READY, zk.FAILED) for s in node_states): + return False + + for node in self._nodes: + if node.state == zk.READY: + self._ready_nodes.append(node) + else: + self._failed_nodes.append(node) + + return True + + class NodeRequestHandler(object): ''' Class to process a single node request. @@ -690,6 +787,8 @@ class NodeRequestHandler(object): self.manager = pw.manager self.launcher_id = pw.launcher_id self.request = request + self.launch_manager = None + self.nodeset = [] self.done = False def _imagesAvailable(self): @@ -730,15 +829,37 @@ class NodeRequestHandler(object): num_in_use = self._countNodes() return num_requested + num_in_use > provider_max - def run(self): - self.log.debug("Handling request %s" % self.request) - try: - self._run() - except Exception: - self.log.exception("Exception in NodeRequestHandler:") - self.request.state = zk.FAILED - self.zk.updateNodeRequest(self.request) - self.zk.unlockNodeRequest(self.request) + def _unlockNodeSet(self): + ''' + Attempt unlocking all Nodes in the object node set. + ''' + for node in self.nodeset: + if not node.lock: + continue + try: + self.zk.unlockNode(node) + except Exception: + self.log.exception("Error unlocking node:") + + def _getReadyNodesOfTypes(self, ntypes): + ''' + Query ZooKeeper for unused/ready nodes. + + :param str ntypes: The node types we want. + + :returns: A dictionary, keyed by node type, with lists of Node objects + that are ready, or an empty dict if none are found. + ''' + ret = {} + for node_id in self.zk.getNodes(): + node = self.zk.getNode(node_id) + if (node and node.state == zk.READY and + not node.allocated_to and node.type in ntypes + ): + if node.type not in ret: + ret[node.type] = [] + ret[node.type].append(node) + return ret def _run(self): ''' @@ -752,16 +873,6 @@ class NodeRequestHandler(object): launchers could attempt to launch a new node after the other launcher has already started doing so. This would cause an expected failure from the underlying library, which is ok for now. - - Algorithm from spec:: - - # If image not available, decline - # If request > quota, decline - # If request < quota and request > available nodes (due to current - usage), begin satisfying the request and do not process further - requests until satisfied - # If request < quota and request < available nodes, satisfy the - request and continue processing further requests ''' if not self._imagesAvailable() or self._wouldExceedQuota(): self.request.declined_by.append(self.launcher_id) @@ -774,17 +885,104 @@ class NodeRequestHandler(object): self.done = True return - # TODO(Shrews): Determine node availability and if we need to launch - # new nodes, or reuse existing nodes. - self.request.state = zk.PENDING self.zk.updateNodeRequest(self.request) - # TODO(Shrews): Make magic happen here + self.launch_manager = NodeLaunchManager(self.zk, retries=3) + ready_nodes = self._getReadyNodesOfTypes(self.request.node_types) - self.request.state = zk.FULFILLED + for ntype in self.request.node_types: + # First try to grab from the list of already available nodes. + got_a_node = False + if ntype in ready_nodes: + for node in ready_nodes[ntype]: + try: + self.zk.lockNode(node, blocking=False) + except exceptions.ZKLockException: + # It's already locked so skip it. 
+ continue + else: + got_a_node = True + node.allocated_to = self.request.id + self.zk.storeNode(node) + self.nodeset.append(node) + break + + # Could not grab an existing node, so launch a new one. + if not got_a_node: + node = zk.Node() + node.state = zk.INIT + node.type = ntype + node.provider = self.provider.name + node.allocated_to = self.request.id + + # Note: It should be safe (i.e., no race) to lock the node + # *after* it is stored since nodes in BUILDING state are not + # locked anywhere. + self.zk.storeNode(node) + self.zk.lockNode(node, blocking=False) + + # Set state AFTER lock so sthat it isn't accidentally cleaned + # up (unlocked BUILDING nodes will be deleted). + node.state = zk.BUILDING + self.zk.storeNode(node) + + # NOTE: We append the node to nodeset if it successfully + # launches. + self.launch_manager.launch(node) + + @property + def alive_thread_count(self): + return self.launch_manager.alive_thread_count + + def run(self): + try: + self._run() + except Exception: + self.log.exception("Exception in NodeRequestHandler:") + self.request.state = zk.FAILED + self.zk.updateNodeRequest(self.request) + self.zk.unlockNodeRequest(self.request) + self.done = True + + def poll(self): + ''' + Check if the request has been handled. + + Once the request has been handled, the 'nodeset' attribute will be + filled with the list of nodes assigned to the request, or it will be + empty if the request could not be fulfilled. + + :returns: True if we are done with the request, False otherwise. + ''' + if self.done: + return True + + if not self.launch_manager.poll(): + return False + + # TODO(Shrews): Verify the request still exists before updating it. + + if self.launch_manager.failed_nodes: + self.nodeset = [] + self.request.declined_by.append(self.launcher_id) + launchers = set(self.zk.getRegisteredLaunchers()) + if launchers.issubset(set(self.request.declined_by)): + # All launchers have declined it + self.request.state = zk.FAILED + else: + self.request.state = zk.REQUESTED + else: + self.nodeset.extend(self.launch_manager.ready_nodes) + for node in self.nodeset: + # Record node ID in the request + self.request.nodes.append(node.id) + self.request.state = zk.FULFILLED + + self._unlockNodeSet() self.zk.updateNodeRequest(self.request) self.zk.unlockNodeRequest(self.request) + return True class ProviderWorker(threading.Thread): @@ -847,10 +1045,16 @@ class ProviderWorker(threading.Thread): self.manager.start() def _activeThreads(self): + ''' + Return the number of alive threads in use by this provider. + + This is an approximate, top-end number for alive threads, since some + threads obviously may have finished by the time we finish the + calculation. + ''' total = 0 - # TODO(Shrews): return a count of active threads - #for r in self.request_handlers: - # total += r.alive_thread_count + for r in self.request_handlers: + total += r.alive_thread_count return total def _assignHandlers(self): @@ -904,10 +1108,9 @@ class ProviderWorker(threading.Thread): Poll handlers to see which have completed. ''' active_handlers = [] - # TODO(Shrews): implement handler polling - #for r in self.request_handlers: - # if not r.poll(): - # active_handlers.append(r) + for r in self.request_handlers: + if not r.poll(): + active_handlers.append(r) self.request_handlers = active_handlers #---------------------------------------------------------------- @@ -935,8 +1138,16 @@ class ProviderWorker(threading.Thread): time.sleep(self.watermark_sleep) def stop(self): + ''' + Shutdown the ProviderWorker thread. 
+ + Do not wait for the request handlers to finish. Any nodes + that are in the process of launching will be cleaned up on a + restart. They will be unlocked and BUILDING in ZooKeeper. + ''' self.log.info("%s received stop" % self.name) self.running = False + if self.manager: self.manager.stop() self.manager.join() diff --git a/nodepool/tests/test_nodelaunchmanager.py b/nodepool/tests/test_nodelaunchmanager.py new file mode 100644 index 000000000..db87121cf --- /dev/null +++ b/nodepool/tests/test_nodelaunchmanager.py @@ -0,0 +1,63 @@ +# Copyright (C) 2017 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import mock +import time + +from nodepool import tests +from nodepool import zk +from nodepool.nodepool import NodeLaunchManager + + +class TestNodeLaunchManager(tests.DBTestCase): + log = logging.getLogger("nodepool.TestNodeLaunchManager") + + def test_successful_launch(self): + n1 = zk.Node() + n1.state = zk.BUILDING + mgr = NodeLaunchManager(self.zk, 0) + mgr.launch(n1) + while not mgr.poll(): + time.sleep(0) + self.assertEqual(len(mgr.ready_nodes), 1) + self.assertEqual(len(mgr.failed_nodes), 0) + + @mock.patch('nodepool.nodepool.NodeLauncher._launchNode') + def test_failed_launch(self, mock_launch): + mock_launch.side_effect = Exception() + n1 = zk.Node() + n1.state = zk.BUILDING + mgr = NodeLaunchManager(self.zk, 0) + mgr.launch(n1) + while not mgr.poll(): + time.sleep(0) + self.assertEqual(len(mgr.failed_nodes), 1) + self.assertEqual(len(mgr.ready_nodes), 0) + + @mock.patch('nodepool.nodepool.NodeLauncher._launchNode') + def test_mixed_launch(self, mock_launch): + mock_launch.side_effect = [None, Exception()] + n1 = zk.Node() + n1.state = zk.BUILDING + n2 = zk.Node() + n2.state = zk.BUILDING + mgr = NodeLaunchManager(self.zk, 0) + mgr.launch(n1) + mgr.launch(n2) + while not mgr.poll(): + time.sleep(0) + self.assertEqual(len(mgr.failed_nodes), 1) + self.assertEqual(len(mgr.ready_nodes), 1) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index c2ea2cdc4..c614a9cbd 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -15,6 +15,7 @@ import json import logging +import mock import time from unittest import skip @@ -30,7 +31,63 @@ import nodepool.nodepool class TestNodepool(tests.DBTestCase): log = logging.getLogger("nodepool.TestNodepool") - def test_decline_and_fail(self): + def test_node_assignment(self): + ''' + Successful node launch should have unlocked nodes in READY state + and assigned to the request. 
+ ''' + configfile = self.setup_config('node.yaml') + self._useBuilder(configfile) + self.waitForImage('fake-provider', 'fake-image') + + pool = self.useNodepool(configfile, watermark_sleep=1) + pool.start() + + req = zk.NodeRequest() + req.node_types.append('fake-image') + self.submitNodeRequest(req) + self.assertEqual(req.state, zk.REQUESTED) + + req = self.waitForNodeRequest(req) + self.assertEqual(req.state, zk.FULFILLED) + + self.assertNotEqual(req.nodes, []) + for node_id in req.nodes: + node = self.zk.getNode(node_id) + self.assertEqual(node.allocated_to, req.id) + self.assertEqual(node.state, zk.READY) + self.zk.lockNode(node, blocking=False) + self.zk.unlockNode(node) + + + @mock.patch('nodepool.nodepool.NodeLauncher._launchNode') + def test_fail_request_on_launch_failure(self, mock_launch): + ''' + Test that provider launch error fails the request. + ''' + mock_launch.side_effect = Exception() + + configfile = self.setup_config('node.yaml') + self._useBuilder(configfile) + self.waitForImage('fake-provider', 'fake-image') + + pool = self.useNodepool(configfile, watermark_sleep=1) + pool.start() + + req = zk.NodeRequest() + req.node_types.append('fake-image') + self.submitNodeRequest(req) + self.assertEqual(req.state, zk.REQUESTED) + + req = self.waitForNodeRequest(req) + self.assertTrue(mock_launch.called) + self.assertEqual(req.state, zk.FAILED) + self.assertNotEqual(req.declined_by, []) + + def test_invalid_image_fails(self): + ''' + Test that an invalid image declines and fails the request. + ''' configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) pool.start() diff --git a/nodepool/zk.py b/nodepool/zk.py index ba19527bc..1eb48c539 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -49,6 +49,8 @@ IN_USE = 'in-use' USED = 'used' # Node is being held HOLD = 'hold' +# Initial node state +INIT = 'init' class ZooKeeperConnectionConfig(object): @@ -358,7 +360,7 @@ class Node(BaseModel): Class representing a launched node. ''' VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED, - HOLD, DELETING, FAILED]) + HOLD, DELETING, FAILED, INIT]) def __init__(self, id=None): super(Node, self).__init__(id) From 315ee5256654c36bec36147fae052622884adc5e Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 6 Feb 2017 14:34:28 -0500 Subject: [PATCH 032/309] Add 'hostname-format' to provider config section We will no longer get the hostname format from the 'target' section of the config file, and instead just grab it from the 'provider' section. Move 'hostname' to 'hostname-format' in the 'provider' section. Also, rename 'template-hostname' to 'image-name-format' since it's used for the external (provider) image names and not really hostnames. 
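For illustration only (the provider name here is a placeholder; the two
format strings are the documented defaults from the change below), a
provider entry now carries both formats directly:

    providers:
      - name: some-provider
        hostname-format: '{label.name}-{provider.name}-{node.id}'
        image-name-format: 'template-{image_name}-{timestamp}'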
Change-Id: I1ab2644d31eb371e1cf6b5e67ed2d2f850cf2464 --- doc/source/configuration.rst | 24 ++++++++++-------------- nodepool/builder.py | 8 ++------ nodepool/cmd/config_validator.py | 5 ++--- nodepool/config.py | 19 +++++++------------ 4 files changed, 21 insertions(+), 35 deletions(-) diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index c6aaede9f..ae64ba8b3 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -308,7 +308,8 @@ provider, the Nodepool image types are also defined (see - az1 boot-timeout: 120 launch-timeout: 900 - template-hostname: 'template-{image.name}-{timestamp}' + image-name-format: 'template-{image_name}-{timestamp}' + hostname-format: '{label.name}-{provider.name}-{node.id}' ipv6-preferred: False networks: - name: 'some-network-name' @@ -341,7 +342,8 @@ provider, the Nodepool image types are also defined (see region-name: 'region1' max-servers: 96 rate: 1.0 - template-hostname: '{image.name}-{timestamp}-nodepool-template' + image-name-format: 'template-{image_name}-{timestamp}' + hostname-format: '{label.name}-{provider.name}-{node.id}' images: - name: precise min-ram: 8192 @@ -448,9 +450,13 @@ provider, the Nodepool image types are also defined (see ``region-name`` - ``template-hostname`` + ``hostname-format`` Hostname template to use for the spawned instance. - Default ``template-{image.name}-{timestamp}`` + Default ``{label.name}-{provider.name}-{node.id}`` + + ``image-name-format`` + Format for image names that are uploaded to providers. + Default ``template-{image_name}-{timestamp}`` ``rate`` In seconds. Default 1.0. @@ -543,11 +549,7 @@ across all of the targets which are on-line:: targets: - name: jenkins1 - hostname: '{label.name}-{provider.name}-{node_id}' - subnode-hostname: '{label.name}-{provider.name}-{node_id}-{subnode_id}' - name: jenkins2 - hostname: '{label.name}-{provider.name}-{node_id}' - subnode-hostname: '{label.name}-{provider.name}-{node_id}-{subnode_id}' **required** @@ -556,12 +558,6 @@ across all of the targets which are on-line:: **optional** - ``hostname`` - Default ``{label.name}-{provider.name}-{node_id}`` - - ``subnode-hostname`` - Default ``{label.name}-{provider.name}-{node_id}-{subnode_id}`` - ``rate`` In seconds. 
Default 1.0 diff --git a/nodepool/builder.py b/nodepool/builder.py index 785dde26b..a3457f135 100644 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -841,12 +841,8 @@ class UploadWorker(BaseWorker): filename = image.to_path(self._config.imagesdir, with_extension=True) - dummy_image = type('obj', (object,), - {'name': image_name, 'id': image.image_id}) - - ext_image_name = provider.template_hostname.format( - provider=provider, image=dummy_image, - timestamp=str(timestamp) + ext_image_name = provider.image_name_format.format( + image_name=image_name, timestamp=str(timestamp) ) self.log.info("Uploading DIB image build %s from %s to %s" % diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index 3c538dd73..3a8a69553 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -76,7 +76,8 @@ class ConfigValidator: 'launch-timeout': int, 'rate': float, 'images': [images], - 'template-hostname': str, + 'hostname-format': str, + 'image-name-format': str, 'clean-floating-ips': bool, } @@ -93,8 +94,6 @@ class ConfigValidator: targets = { 'name': str, - 'hostname': str, - 'subnode-hostname': str, 'jenkins': { 'url': str, 'user': str, diff --git a/nodepool/config.py b/nodepool/config.py index cf714bf8a..30f531d7d 100644 --- a/nodepool/config.py +++ b/nodepool/config.py @@ -198,9 +198,13 @@ def loadConfig(config_path): p.ipv6_preferred = provider.get('ipv6-preferred') p.clean_floating_ips = provider.get('clean-floating-ips') p.azs = provider.get('availability-zones') - p.template_hostname = provider.get( - 'template-hostname', - 'template-{image.name}-{timestamp}' + p.hostname_format = provider.get( + 'hostname-format', + '{label.name}-{provider.name}-{node.id}' + ) + p.image_name_format = provider.get( + 'image-name-format', + 'template-{image_name}-{timestamp}' ) p.image_type = provider.get( 'image-type', p.cloud_config.config['image_format']) @@ -286,15 +290,6 @@ def loadConfig(config_path): t.jenkins_apikey = None t.jenkins_credentials_id = None - t.hostname = target.get( - 'hostname', - '{label.name}-{provider.name}-{node_id}' - ) - t.subnode_hostname = target.get( - 'subnode-hostname', - '{label.name}-{provider.name}-{node_id}-{subnode_id}' - ) - return newconfig From 0d3272fcaf78245da1786bcb3c59df9facd9e971 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 7 Feb 2017 16:56:54 -0500 Subject: [PATCH 033/309] Unallocate new nodes if request is pulled If a node request is gone when we have finished launching nodes for it, unallocate the nodes to allow another request to grab them. Change-Id: I238cc63ce5a90ea003135c0a9252a4a30d838e00 --- nodepool/nodepool.py | 10 +++++++++- nodepool/zk.py | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 517bfe06b..92c5ad038 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -961,7 +961,15 @@ class NodeRequestHandler(object): if not self.launch_manager.poll(): return False - # TODO(Shrews): Verify the request still exists before updating it. + # If the request has been pulled, unallocate the node set so other + # requests can use them. 
+ if not self.zk.getNodeRequest(self.request.id): + self.log.info("Node request %s disappeared", self.request.id) + for node in self.nodeset: + node.allocated_to = None + self.zk.storeNode(node) + self._unlockNodeSet() + return True if self.launch_manager.failed_nodes: self.nodeset = [] diff --git a/nodepool/zk.py b/nodepool/zk.py index 1eb48c539..868c7ad03 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -1191,6 +1191,8 @@ class ZooKeeper(object): ''' Get the data for a specific node request. + :param str request: The request ID. + :returns: The request data, or None if the request was not found. ''' path = self._requestPath(request) From 91c2180baafd0e8a6dae49be3c0d998e4e6f2bec Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Wed, 8 Feb 2017 10:21:41 -0500 Subject: [PATCH 034/309] Add new Node ZK model attributes. This should complete the Node model, based on the representation in the ZuulV3 spec: https://specs.openstack.org/openstack-infra/infra-specs/specs/zuulv3.html Change-Id: Ic2911023b3a7d03a896a2c44c2c401a0889bcc38 --- nodepool/tests/test_zk.py | 31 ++++++++++++++++++++++++++++++- nodepool/zk.py | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index bbdfc7eef..8bdf33ab6 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -648,16 +648,31 @@ class TestZKModel(tests.BaseTestCase): def test_Node_toDict(self): o = zk.Node('123') + o.state = zk.INIT o.provider = 'rax' o.type = 'trusty' o.allocated_to = '456-789' + o.az = 'RegionOne' + o.public_ipv4 = '' + o.private_ipv4 = '' + o.public_ipv6 = '' + o.image_id = 'image-id' + o.launcher = 'launcher-id' + d = o.toDict() self.assertNotIn('id', d) - self.assertIn('state', d) + self.assertEqual(d['state'], o.state) self.assertIn('state_time', d) + self.assertIn('created_time', d) self.assertEqual(d['provider'], o.provider) self.assertEqual(d['type'], o.type) self.assertEqual(d['allocated_to'], o.allocated_to) + self.assertEqual(d['az'], o.az) + self.assertEqual(d['public_ipv4'], o.public_ipv4) + self.assertEqual(d['private_ipv4'], o.private_ipv4) + self.assertEqual(d['public_ipv6'], o.public_ipv6) + self.assertEqual(d['image_id'], o.image_id) + self.assertEqual(d['launcher'], o.launcher) def test_Node_fromDict(self): now = int(time.time()) @@ -665,15 +680,29 @@ class TestZKModel(tests.BaseTestCase): d = { 'state': zk.READY, 'state_time': now, + 'created_time': now - 2, 'provider': 'rax', 'type': 'trusty', 'allocated_to': '456-789', + 'az': 'RegionOne', + 'public_ipv4': '', + 'private_ipv4': '', + 'public_ipv6': '', + 'image_id': 'image-id', + 'launcher': 'launcher-id', } o = zk.Node.fromDict(d, node_id) self.assertEqual(o.id, node_id) self.assertEqual(o.state, d['state']) self.assertEqual(o.state_time, d['state_time']) + self.assertEqual(o.created_time, d['created_time']) self.assertEqual(o.provider, d['provider']) self.assertEqual(o.type, d['type']) self.assertEqual(o.allocated_to, d['allocated_to']) + self.assertEqual(o.az, d['az']) + self.assertEqual(o.public_ipv4, d['public_ipv4']) + self.assertEqual(o.private_ipv4, d['private_ipv4']) + self.assertEqual(o.public_ipv6, d['public_ipv6']) + self.assertEqual(o.image_id, d['image_id']) + self.assertEqual(o.launcher, d['launcher']) diff --git a/nodepool/zk.py b/nodepool/zk.py index 868c7ad03..0a7e2e398 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -368,6 +368,13 @@ class Node(BaseModel): self.provider = None self.type = None self.allocated_to = None 
+ self.az = None + self.public_ipv4 = None + self.private_ipv4 = None + self.public_ipv6 = None + self.image_id = None + self.launcher = None + self.created_time = None def __repr__(self): d = self.toDict() @@ -382,7 +389,14 @@ class Node(BaseModel): self.state_time == other.state_time and self.provider == other.provider and self.type == other.type and - self.allocated_to == other.allocated_to) + self.allocated_to == other.allocated_to and + self.az == other.az and + self.public_ipv4 == other.public_ipv4 and + self.private_ipv4 == other.private_ipv4 and + self.public_ipv6 == other.public_ipv6 and + self.image_id == other.image_id and + self.launcher == other.launcher and + self.created_time == other.created_time) else: return False @@ -394,6 +408,13 @@ class Node(BaseModel): d['provider'] = self.provider d['type'] = self.type d['allocated_to'] = self.allocated_to + d['az'] = self.az + d['public_ipv4'] = self.public_ipv4 + d['private_ipv4'] = self.private_ipv4 + d['public_ipv6'] = self.public_ipv6 + d['image_id'] = self.image_id + d['launcher'] = self.launcher + d['created_time'] = self.created_time return d @staticmethod @@ -411,6 +432,13 @@ class Node(BaseModel): o.provider = d.get('provider') o.type = d.get('type') o.allocated_to = d.get('allocated_to') + o.az = d.get('az') + o.public_ipv4 = d.get('public_ipv4') + o.private_ipv4 = d.get('private_ipv4') + o.public_ipv6 = d.get('public_ipv6') + o.image_id = d.get('image_id') + o.launcher = d.get('launcher') + o.created_time = d.get('created_time') return o @@ -1360,6 +1388,15 @@ class ZooKeeper(object): ''' if not node.id: node_path = "%s/" % self.NODE_ROOT + + # We expect a new node to always have a state already set, so + # use that state_time for created_time for consistency. But have + # this check, just in case. + if node.state_time: + node.created_time = node.state_time + else: + node.created_time = time.time() + path = self.client.create( node_path, value=node.serialize(), From dd6de5b506f8fe03939055fad6ae1382ff9cd0a1 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 9 Feb 2017 09:12:46 -0500 Subject: [PATCH 035/309] Remove 'template-' from image name format Change-Id: I5d37fc28752125f42c9c7fa1a2869486187f3b22 --- nodepool/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodepool/config.py b/nodepool/config.py index 30f531d7d..1d218ed58 100644 --- a/nodepool/config.py +++ b/nodepool/config.py @@ -204,7 +204,7 @@ def loadConfig(config_path): ) p.image_name_format = provider.get( 'image-name-format', - 'template-{image_name}-{timestamp}' + '{image_name}-{timestamp}' ) p.image_type = provider.get( 'image-type', p.cloud_config.config['image_format']) From b27b4798a4a87530d03396172989a1d6f1cd226e Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 10 Feb 2017 12:37:21 -0500 Subject: [PATCH 036/309] Remove subnodes from nodepool As we move closer to zuulv3, the concept of sub-nodes no longer applies. As a result, we can start to remove the legacy code making it easier to re-enable our unit tests in the future. 
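In practice this means label definitions simply drop the 'subnodes'
key; for example, the 'trusty-2-node' label in the config-validate
fixture updated below reduces to:

    labels:
      - name: trusty-2-node
        image: trusty
        ready-script: multinode_setup.sh
        min-ready: 0
        providers:
          - name: cloud1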
Change-Id: If964e082bb56b32c8fbc0f3539b83629976fe041 Signed-off-by: Paul Belanger --- doc/source/configuration.rst | 12 - nodepool/allocation.py | 10 +- nodepool/cmd/config_validator.py | 1 - nodepool/config.py | 1 - nodepool/nodedb.py | 76 +---- nodepool/nodepool.py | 264 +----------------- .../tests/fixtures/config_validate/good.yaml | 1 - .../fixtures/config_validate/yaml_error.yaml | 1 - nodepool/tests/fixtures/subnodes.yaml | 61 ---- nodepool/tests/test_allocator.py | 32 +-- nodepool/tests/test_nodepool.py | 63 ----- tools/fake.yaml | 1 - 12 files changed, 27 insertions(+), 496 deletions(-) delete mode 100644 nodepool/tests/fixtures/subnodes.yaml diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index ae64ba8b3..1f4eea811 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -147,7 +147,6 @@ providers or images are used to create them). Example:: - name: provider2 - name: multi-precise image: precise - subnodes: 2 min-ready: 2 ready-script: setup_multinode.sh providers: @@ -172,17 +171,6 @@ providers or images are used to create them). Example:: label considered disabled. ``min-ready`` is best-effort based on available capacity and is not a guaranteed allocation. - ``subnodes`` - Used to configure multi-node support. If a `subnodes` key is supplied to - an image, it indicates that the specified number of additional nodes of the - same image type should be created and associated with each node for that - image. - - Only one node from each such group will be added to the target, the - subnodes are expected to communicate directly with each other. In the - example above, for each Precise node added to the target system, two - additional nodes will be created and associated with it. - ``ready-script`` A script to be used to perform any last minute changes to a node after it has been launched but before it is put in the READY state to receive jobs. diff --git a/nodepool/allocation.py b/nodepool/allocation.py index cf34afb18..c834dffc0 100644 --- a/nodepool/allocation.py +++ b/nodepool/allocation.py @@ -236,11 +236,11 @@ class AllocationRequest(object): art = AllocationRequestTarget(self, target, current) self.request_targets[target] = art - def addProvider(self, provider, target, subnodes): + def addProvider(self, provider, target): # Handle being called multiple times with different targets. s = self.sub_requests.get(provider) if not s: - s = AllocationSubRequest(self, provider, subnodes) + s = AllocationSubRequest(self, provider) agt = s.addTarget(self.request_targets[target]) self.sub_requests[provider] = s if s not in provider.sub_requests: @@ -263,11 +263,10 @@ class AllocationRequest(object): class AllocationSubRequest(object): """A request for a number of images from a specific provider.""" - def __init__(self, request, provider, subnodes): + def __init__(self, request, provider): self.request = request self.provider = provider self.amount = 0.0 - self.subnodes = subnodes self.targets = [] def __repr__(self): @@ -313,8 +312,7 @@ class AllocationSubRequest(object): self.amount = amount # Adjust provider and request values accordingly. self.request.amount -= amount - subnode_factor = 1 + self.subnodes - self.provider.available -= (amount * subnode_factor) + self.provider.available -= (amount) # Adjust the requested values for related sub-requests. self.request.makeRequests() # Allocate these granted nodes to targets. 
diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index 3a8a69553..65afd65cc 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -86,7 +86,6 @@ class ConfigValidator: 'image': str, 'min-ready': int, 'ready-script': str, - 'subnodes': int, 'providers': [{ 'name': str, }], diff --git a/nodepool/config.py b/nodepool/config.py index 1d218ed58..7a49152ba 100644 --- a/nodepool/config.py +++ b/nodepool/config.py @@ -269,7 +269,6 @@ def loadConfig(config_path): newconfig.labels[l.name] = l l.image = label['image'] l.min_ready = label.get('min-ready', 2) - l.subnodes = label.get('subnodes', 0) l.ready_script = label.get('ready-script') l.providers = {} for provider in label['providers']: diff --git a/nodepool/nodedb.py b/nodepool/nodedb.py index 62abb29da..d7ae959b7 100644 --- a/nodepool/nodedb.py +++ b/nodepool/nodedb.py @@ -43,7 +43,7 @@ STATE_NAMES = { from sqlalchemy import Table, Column, Integer, String, \ MetaData, create_engine -from sqlalchemy.orm import scoped_session, mapper, relationship, foreign +from sqlalchemy.orm import scoped_session, mapper from sqlalchemy.orm.session import Session, sessionmaker metadata = MetaData() @@ -75,24 +75,6 @@ node_table = Table( Column('comment', String(255)), mysql_engine='InnoDB', ) -subnode_table = Table( - 'subnode', metadata, - Column('id', Integer, primary_key=True), - Column('node_id', Integer, index=True, nullable=False), - # Machine name - Column('hostname', String(255), index=True), - # Provider assigned id for this machine - Column('external_id', String(255)), - # Primary IP address - Column('ip', String(255)), - # Internal/fixed IP address - Column('ip_private', String(255)), - # One of the above values - Column('state', Integer), - # Time of last state change - Column('state_time', Integer), - mysql_engine='InnoDB', - ) job_table = Table( 'job', metadata, Column('id', Integer, primary_key=True), @@ -138,38 +120,6 @@ class Node(object): session.commit() -class SubNode(object): - def __init__(self, node, - hostname=None, external_id=None, ip=None, ip_private=None, - state=BUILDING): - self.node_id = node.id - self.provider_name = node.provider_name - self.label_name = node.label_name - self.target_name = node.target_name - self.external_id = external_id - self.ip = ip - self.ip_private = ip_private - self.hostname = hostname - self.state = state - - def delete(self): - session = Session.object_session(self) - session.delete(self) - session.commit() - - @property - def state(self): - return self._state - - @state.setter - def state(self, state): - self._state = state - self.state_time = int(time.time()) - session = Session.object_session(self) - if session: - session.commit() - - class Job(object): def __init__(self, name=None, hold_on_failure=0): self.name = name @@ -184,19 +134,9 @@ class Job(object): mapper(Job, job_table) -mapper(SubNode, subnode_table, - properties=dict(_state=subnode_table.c.state)) - - mapper(Node, node_table, properties=dict( - _state=node_table.c.state, - subnodes=relationship( - SubNode, - cascade='all, delete-orphan', - uselist=True, - primaryjoin=foreign(subnode_table.c.node_id) == node_table.c.id, - backref='node'))) + _state=node_table.c.state)) class NodeDatabase(object): @@ -259,24 +199,12 @@ class NodeDatabaseSession(object): self.commit() return new - def createSubNode(self, *args, **kwargs): - new = SubNode(*args, **kwargs) - self.session().add(new) - self.commit() - return new - def getNode(self, id): nodes = 
self.session().query(Node).filter_by(id=id).all() if not nodes: return None return nodes[0] - def getSubNode(self, id): - nodes = self.session().query(SubNode).filter_by(id=id).all() - if not nodes: - return None - return nodes[0] - def getNodeByHostname(self, hostname): nodes = self.session().query(Node).filter_by(hostname=hostname).all() if not nodes: diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 92c5ad038..d08c1101e 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -364,20 +364,7 @@ class OLDNodeLauncher(threading.Thread): # Save the elapsed time for statsd dt = int((time.time() - start_time) * 1000) - if self.label.subnodes: - self.log.info("Node id: %s is waiting on subnodes" % self.node.id) - - while ((time.time() - start_time) < (NODE_CLEANUP - 60)): - session.commit() - ready_subnodes = [n for n in self.node.subnodes - if n.state == nodedb.READY] - if len(ready_subnodes) == self.label.subnodes: - break - time.sleep(5) - nodelist = [] - for subnode in self.node.subnodes: - nodelist.append(('sub', subnode)) nodelist.append(('primary', self.node)) self.writeNodepoolInfo(nodelist) @@ -460,16 +447,6 @@ class OLDNodeLauncher(threading.Thread): f = ftp.open('/etc/nodepool/primary_node_private', 'w') f.write(self.node.ip_private + '\n') f.close() - # The IPs of all sub nodes in this node set - f = ftp.open('/etc/nodepool/sub_nodes', 'w') - for subnode in self.node.subnodes: - f.write(subnode.ip + '\n') - f.close() - # The private IPs of all sub nodes in this node set - f = ftp.open('/etc/nodepool/sub_nodes_private', 'w') - for subnode in self.node.subnodes: - f.write(subnode.ip_private + '\n') - f.close() # The SSH key for this node set f = ftp.open('/etc/nodepool/id_rsa', 'w') key.write_private_key(f) @@ -511,166 +488,6 @@ class OLDNodeLauncher(threading.Thread): output=True) -class SubNodeLauncher(threading.Thread): - log = logging.getLogger("nodepool.SubNodeLauncher") - - def __init__(self, nodepool, provider, label, subnode_id, - node_id, node_target_name, timeout, launch_timeout, node_az, - manager_name): - threading.Thread.__init__(self, name='SubNodeLauncher for %s' - % subnode_id) - self.provider = provider - self.label = label - self.image = provider.images[label.image] - self.node_target_name = node_target_name - self.subnode_id = subnode_id - self.node_id = node_id - self.timeout = timeout - self.nodepool = nodepool - self.launch_timeout = launch_timeout - self.node_az = node_az - self.manager_name = manager_name - - def run(self): - try: - self._run() - except Exception: - self.log.exception("Exception in run method:") - - def _run(self): - with self.nodepool.getDB().getSession() as session: - self.log.debug("Launching subnode id: %s for node id: %s" % - (self.subnode_id, self.node_id)) - try: - self.subnode = session.getSubNode(self.subnode_id) - self.manager = self.nodepool.getProviderManager(self.provider) - except Exception: - self.log.exception("Exception preparing to launch subnode " - "id: %s for node id: %s:" - % (self.subnode_id, self.node_id)) - return - - try: - start_time = time.time() - dt = self.launchSubNode(session) - failed = False - statsd_key = 'ready' - except Exception as e: - self.log.exception("%s launching subnode id: %s " - "for node id: %s in provider: %s error:" % - (e.__class__.__name__, self.subnode_id, - self.node_id, self.provider.name)) - dt = int((time.time() - start_time) * 1000) - failed = True - if hasattr(e, 'statsd_key'): - statsd_key = e.statsd_key - else: - statsd_key = 'error.unknown' - - try: - 
self.nodepool.launchStats(statsd_key, dt, self.image.name, - self.provider.name, - self.node_target_name, - self.node_az, - self.manager_name) - except Exception: - self.log.exception("Exception reporting launch stats:") - - if failed: - try: - self.nodepool.deleteSubNode(self.subnode, self.manager) - except Exception: - self.log.exception("Exception deleting subnode id: %s: " - "for node id: %s:" % - (self.subnode_id, self.node_id)) - return - - def launchSubNode(self, session): - start_time = time.time() - timestamp = int(start_time) - - target = self.nodepool.config.targets[self.node_target_name] - hostname = target.subnode_hostname.format( - label=self.label, provider=self.provider, node_id=self.node_id, - subnode_id=self.subnode_id, timestamp=str(timestamp)) - self.subnode.hostname = hostname - self.subnode.nodename = hostname.split('.')[0] - - cloud_image = self.nodepool.zk.getMostRecentImageUpload( - self.image.name, self.provider.name) - if not cloud_image: - raise LaunchNodepoolException("Unable to find current cloud " - "image %s in %s" % - (self.image.name, - self.provider.name)) - - self.log.info("Creating server with hostname %s in %s from image %s " - "for subnode id: %s for node id: %s" - % (hostname, self.provider.name, - self.image.name, self.subnode_id, self.node_id)) - server = self.manager.createServer( - hostname, self.image.min_ram, cloud_image.external_id, - name_filter=self.image.name_filter, az=self.node_az, - config_drive=self.image.config_drive, - nodepool_node_id=self.node_id, - nodepool_image_name=self.image.name) - server_id = server['id'] - self.subnode.external_id = server_id - session.commit() - - self.log.debug("Waiting for server %s for subnode id: %s for " - "node id: %s" % - (server_id, self.subnode_id, self.node_id)) - server = self.manager.waitForServer(server, self.launch_timeout) - if server['status'] != 'ACTIVE': - raise LaunchStatusException("Server %s for subnode id: " - "%s for node id: %s " - "status: %s" % - (server_id, self.subnode_id, - self.node_id, server['status'])) - - ip = server.get('public_v4') - ip_v6 = server.get('public_v6') - if self.provider.ipv6_preferred: - if ip_v6: - ip = ip_v6 - else: - self.log.warning('Preferred ipv6 not available, ' - 'falling back to ipv4.') - if not ip: - raise LaunchNetworkException("Unable to find public IP of server") - - self.subnode.ip_private = server.get('private_v4') - # devstack-gate multi-node depends on private_v4 being populated - # with something. On clouds that don't have a private address, use - # the public. 
- if not self.subnode.ip_private: - self.subnode.ip_private = server.get('public_v4') - self.subnode.ip = ip - self.log.debug("Subnode id: %s for node id: %s is running, " - "ipv4: %s, ipv6: %s" % - (self.subnode_id, self.node_id, server.get('public_v4'), - server.get('public_v6'))) - - self.log.debug("Subnode id: %s for node id: %s testing ssh at ip: %s" % - (self.subnode_id, self.node_id, ip)) - connect_kwargs = dict(key_filename=self.image.private_key) - if not utils.ssh_connect(ip, self.image.username, - connect_kwargs=connect_kwargs, - timeout=self.timeout): - raise LaunchAuthException("Unable to connect via ssh") - - # Save the elapsed time for statsd - dt = int((time.time() - start_time) * 1000) - - self.subnode.state = nodedb.READY - self.log.info("Subnode id: %s for node id: %s is ready" - % (self.subnode_id, self.node_id)) - self.nodepool.updateStats(session, self.provider.name) - - return dt - - class NodeLauncher(threading.Thread): def __init__(self, zk, node, retries): threading.Thread.__init__(self) @@ -1340,12 +1157,12 @@ class NodePool(threading.Thread): n.label_name == label_name and n.state == state)]) - def count_nodes_and_subnodes(provider_name): + def count_provider_nodes(provider_name): count = 0 for n in nodes: if n.provider_name != provider_name: continue - count += 1 + len(n.subnodes) + count += 1 return count # Add a provider for each node provider, along with current @@ -1353,7 +1170,7 @@ class NodePool(threading.Thread): allocation_providers = {} for provider in self.config.providers.values(): provider_max = provider.max_servers - n_provider = count_nodes_and_subnodes(provider.name) + n_provider = count_provider_nodes(provider.name) available = provider_max - n_provider if available < 0: self.log.warning("Provider %s over-allocated: " @@ -1436,7 +1253,7 @@ class NodePool(threading.Thread): # request should be distributed to this target). sr, agt = ar.addProvider( allocation_providers[provider.name], - at, label.subnodes) + at, 0) tlps[agt] = (target, label, self.config.providers[provider.name]) else: @@ -1470,19 +1287,6 @@ class NodePool(threading.Thread): self.log.debug("Finished node launch calculation") return nodes_to_launch - def getNeededSubNodes(self, session): - nodes_to_launch = [] - for node in session.getNodes(): - if node.label_name in self.config.labels: - expected_subnodes = \ - self.config.labels[node.label_name].subnodes - active_subnodes = len([n for n in node.subnodes - if n.state != nodedb.DELETE]) - deficit = max(expected_subnodes - active_subnodes, 0) - if deficit: - nodes_to_launch.append((node, deficit)) - return nodes_to_launch - def updateConfig(self): config = self.loadConfig() self.reconfigureZooKeeper(config) @@ -1527,16 +1331,6 @@ class NodePool(threading.Thread): self._wake_condition.release() def _run(self, session, allocation_history): - # Make up the subnode deficit first to make sure that an - # already allocated node has priority in filling its subnodes - # ahead of new nodes. 
- subnodes_to_launch = self.getNeededSubNodes(session) - for (node, num_to_launch) in subnodes_to_launch: - self.log.info("Need to launch %s subnodes for node id: %s" % - (num_to_launch, node.id)) - for i in range(num_to_launch): - self.launchSubNode(session, node) - nodes_to_launch = self.getNeededNodes(session, allocation_history) for (tlp, num_to_launch) in nodes_to_launch: @@ -1575,39 +1369,6 @@ class NodePool(threading.Thread): launch_timeout) t.start() - def launchSubNode(self, session, node): - try: - self._launchSubNode(session, node) - except Exception: - self.log.exception( - "Could not launch subnode for node id: %s", node.id) - - def _launchSubNode(self, session, node): - provider = self.config.providers[node.provider_name] - label = self.config.labels[node.label_name] - timeout = provider.boot_timeout - launch_timeout = provider.launch_timeout - subnode = session.createSubNode(node) - t = SubNodeLauncher(self, provider, label, subnode.id, - node.id, node.target_name, timeout, launch_timeout, - node_az=node.az, manager_name=node.manager_name) - t.start() - - def deleteSubNode(self, subnode, manager): - # Don't try too hard here, the actual node deletion will make - # sure this is cleaned up. - if subnode.external_id: - try: - self.log.debug('Deleting server %s for subnode id: ' - '%s of node id: %s' % - (subnode.external_id, subnode.id, - subnode.node.id)) - manager.cleanupServer(subnode.external_id) - manager.waitForServerDeletion(subnode.external_id) - except provider_manager.NotFound: - pass - subnode.delete() - def deleteNode(self, node_id): try: self._delete_threads_lock.acquire() @@ -1654,16 +1415,6 @@ class NodePool(threading.Thread): self.log.exception("Exception revoking node id: %s" % node.id) - for subnode in node.subnodes: - if subnode.external_id: - try: - self.log.debug('Deleting server %s for subnode id: ' - '%s of node id: %s' % - (subnode.external_id, subnode.id, node.id)) - manager.cleanupServer(subnode.external_id) - except provider_manager.NotFound: - pass - if node.external_id: try: self.log.debug('Deleting server %s for node id: %s' % @@ -1674,11 +1425,6 @@ class NodePool(threading.Thread): pass node.external_id = None - for subnode in node.subnodes: - if subnode.external_id: - manager.waitForServerDeletion(subnode.external_id) - subnode.delete() - node.delete() self.log.info("Deleted node id: %s" % node.id) @@ -1886,7 +1632,7 @@ class NodePool(threading.Thread): continue state = nodedb.STATE_NAMES[node.state] key = 'nodepool.nodes.%s' % state - total_nodes = self.config.labels[node.label_name].subnodes + 1 + total_nodes = 1 states[key] += total_nodes # NOTE(pabelanger): Check if we assign nodes via Gearman if so, use diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml index dd9cafeee..087bdfa9e 100644 --- a/nodepool/tests/fixtures/config_validate/good.yaml +++ b/nodepool/tests/fixtures/config_validate/good.yaml @@ -21,7 +21,6 @@ labels: - name: trusty-2-node image: trusty ready-script: multinode_setup.sh - subnodes: 1 min-ready: 0 providers: - name: cloud1 diff --git a/nodepool/tests/fixtures/config_validate/yaml_error.yaml b/nodepool/tests/fixtures/config_validate/yaml_error.yaml index 08dd9626c..c8996334e 100644 --- a/nodepool/tests/fixtures/config_validate/yaml_error.yaml +++ b/nodepool/tests/fixtures/config_validate/yaml_error.yaml @@ -21,7 +21,6 @@ labels: - name: trusty-2-node image: trusty ready-script: multinode_setup.sh - subnodes: 1 min-ready: 0 providers: - name: cloud1 diff --git 
a/nodepool/tests/fixtures/subnodes.yaml b/nodepool/tests/fixtures/subnodes.yaml deleted file mode 100644 index 53c9ff8b3..000000000 --- a/nodepool/tests/fixtures/subnodes.yaml +++ /dev/null @@ -1,61 +0,0 @@ -elements-dir: . -images-dir: '{images_dir}' - -cron: - check: '*/15 * * * *' - cleanup: '*/1 * * * *' - -zookeeper-servers: - - host: {zookeeper_host} - port: {zookeeper_port} - chroot: {zookeeper_chroot} - -labels: - - name: fake-label - image: fake-image - min-ready: 2 - providers: - - name: fake-provider - - name: multi-fake - image: fake-image - ready-script: multinode_setup.sh - subnodes: 2 - min-ready: 2 - providers: - - name: fake-provider - -providers: - - name: fake-provider - region-name: fake-region - keypair: 'if-present-use-this-keypair' - username: 'fake' - password: 'fake' - auth-url: 'fake' - project-id: 'fake' - max-servers: 96 - pool: 'fake' - networks: - - net-id: 'some-uuid' - rate: 0.0001 - images: - - name: fake-image - min-ram: 8192 - name-filter: 'Fake' - meta: - key: value - key2: value - -targets: - - name: fake-target - -diskimages: - - name: fake-image - elements: - - fedora - - vm - release: 21 - env-vars: - TMPDIR: /opt/dib_tmp - DIB_IMAGE_CACHE: /opt/dib_cache - DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ - BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/test_allocator.py b/nodepool/tests/test_allocator.py index cdcdc408b..23279f606 100644 --- a/nodepool/tests/test_allocator.py +++ b/nodepool/tests/test_allocator.py @@ -40,7 +40,7 @@ class OneLabel(tests.AllocatorTestCase, tests.BaseTestCase): at1 = allocation.AllocationTarget('target1') ar1 = allocation.AllocationRequest('label1', self.label1) ar1.addTarget(at1, 0) - self.agt.append(ar1.addProvider(ap1, at1, 0)[1]) + self.agt.append(ar1.addProvider(ap1, at1)[1]) ap1.makeGrants() @@ -67,8 +67,8 @@ class TwoLabels(tests.AllocatorTestCase, tests.BaseTestCase): ar2 = allocation.AllocationRequest('label2', self.label2) ar1.addTarget(at1, 0) ar2.addTarget(at1, 0) - self.agt.append(ar1.addProvider(ap1, at1, 0)[1]) - self.agt.append(ar2.addProvider(ap1, at1, 0)[1]) + self.agt.append(ar1.addProvider(ap1, at1)[1]) + self.agt.append(ar2.addProvider(ap1, at1)[1]) ap1.makeGrants() @@ -115,10 +115,10 @@ class TwoProvidersTwoLabels(tests.AllocatorTestCase, tests.BaseTestCase): ar2 = allocation.AllocationRequest('label2', self.label2) ar1.addTarget(at1, 0) ar2.addTarget(at1, 0) - self.agt.append(ar1.addProvider(ap1, at1, 0)[1]) - self.agt.append(ar2.addProvider(ap1, at1, 0)[1]) - self.agt.append(ar1.addProvider(ap2, at1, 0)[1]) - self.agt.append(ar2.addProvider(ap2, at1, 0)[1]) + self.agt.append(ar1.addProvider(ap1, at1)[1]) + self.agt.append(ar2.addProvider(ap1, at1)[1]) + self.agt.append(ar1.addProvider(ap2, at1)[1]) + self.agt.append(ar2.addProvider(ap2, at1)[1]) ap1.makeGrants() ap2.makeGrants() @@ -170,9 +170,9 @@ class TwoProvidersTwoLabelsOneShared(tests.AllocatorTestCase, ar2 = allocation.AllocationRequest('label2', self.label2) ar1.addTarget(at1, 0) ar2.addTarget(at1, 0) - self.agt.append(ar1.addProvider(ap1, at1, 0)[1]) - self.agt.append(ar2.addProvider(ap1, at1, 0)[1]) - self.agt.append(ar2.addProvider(ap2, at1, 0)[1]) + self.agt.append(ar1.addProvider(ap1, at1)[1]) + self.agt.append(ar2.addProvider(ap1, at1)[1]) + self.agt.append(ar2.addProvider(ap2, at1)[1]) ap1.makeGrants() ap2.makeGrants() @@ -293,8 +293,8 @@ class RoundRobinAllocation(tests.RoundRobinTestCase, tests.BaseTestCase): # providers 
for ar in ars: ar.addTarget(at1, 0) - ar.addProvider(ap1, at1, 0) - ar.addProvider(ap2, at1, 0) + ar.addProvider(ap1, at1) + ar.addProvider(ap2, at1) ap1.makeGrants() for g in ap1.grants: @@ -414,15 +414,15 @@ class RoundRobinFixedProvider(tests.RoundRobinTestCase, tests.BaseTestCase): # first ar can only go to provider1, the last only to # provider2 ars[0].addTarget(at1, 0) - ars[0].addProvider(ap1, at1, 0) + ars[0].addProvider(ap1, at1) ars[-1].addTarget(at1, 0) - ars[-1].addProvider(ap2, at1, 0) + ars[-1].addProvider(ap2, at1) # the rest can go anywhere for ar in ars[1:-1]: ar.addTarget(at1, 0) - ar.addProvider(ap1, at1, 0) - ar.addProvider(ap2, at1, 0) + ar.addProvider(ap1, at1) + ar.addProvider(ap2, at1) ap1.makeGrants() for g in ap1.grants: diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index c614a9cbd..c82cba68f 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -214,69 +214,6 @@ class TestNodepool(tests.DBTestCase): state=nodedb.READY) self.assertEqual(len(nodes), 2) - @skip("Disabled for early v3 development") - def test_subnodes(self): - """Test that an image and node are created""" - configfile = self.setup_config('subnodes.yaml') - pool = self.useNodepool(configfile, watermark_sleep=1) - self._useBuilder(configfile) - pool.start() - self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 2) - nodes = session.getNodes(provider_name='fake-provider', - label_name='multi-fake', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 2) - for node in nodes: - self.assertEqual(len(node.subnodes), 2) - for subnode in node.subnodes: - self.assertEqual(subnode.state, nodedb.READY) - - @skip("Disabled for early v3 development") - def test_subnode_deletion_success(self): - """Test that subnodes are deleted with parent node""" - configfile = self.setup_config('subnodes.yaml') - pool = self.useNodepool(configfile, watermark_sleep=1) - self._useBuilder(configfile) - pool.start() - self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - - subnode_ids = [] - node_ids = [] - - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='multi-fake', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 2) - for node in nodes: - self.assertEqual(len(node.subnodes), 2) - for subnode in node.subnodes: - self.assertEqual(subnode.state, nodedb.READY) - subnode_ids.append(subnode.id) - node_ids.append(node.id) - - for node_id in node_ids: - pool.deleteNode(node_id) - - self.wait_for_threads() - self.waitForNodes(pool) - - with pool.getDB().getSession() as session: - for subnode_id in subnode_ids: - s = session.getSubNode(subnode_id) - self.assertIsNone(s) - @skip("Disabled for early v3 development") def test_node_az(self): """Test that an image and node are created with az specified""" diff --git a/tools/fake.yaml b/tools/fake.yaml index faf6e87c5..94425ce63 100644 --- a/tools/fake.yaml +++ b/tools/fake.yaml @@ -28,7 +28,6 @@ labels: - name: multi-fake image: fake-nodepool ready-script: multinode_setup.sh - subnodes: 2 min-ready: 2 providers: - name: fake-provider From e35e2f14ebb812bb3d90a87e28a4e05d62e9504e Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 9 Feb 2017 
08:29:18 -0500 Subject: [PATCH 037/309] Implement node launching This implements actual node launching by NodeLauncher and also adds support for launching min-ready nodes. The min-ready nodes functionality is implemented by submitting node requests for the labels defined in the nodepool config. Since we now create node requests, and not just update them, the ZK API method updateNodeRequest() is renamed storeNodeRequest(). Change-Id: I72a7b85f8560e996124066a1e3bc35886f867f7e --- nodepool/nodepool.py | 323 ++++++++++++++++++++--- nodepool/tests/__init__.py | 15 +- nodepool/tests/test_nodelaunchmanager.py | 43 ++- nodepool/tests/test_nodepool.py | 16 +- nodepool/tests/test_zk.py | 56 ++++ nodepool/zk.py | 80 +++++- 6 files changed, 459 insertions(+), 74 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 92c5ad038..42fd443db 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -672,15 +672,180 @@ class SubNodeLauncher(threading.Thread): class NodeLauncher(threading.Thread): - def __init__(self, zk, node, retries): - threading.Thread.__init__(self) + + def __init__(self, zk, provider, label, provider_manager, node, retries): + ''' + Initialize the launcher. + + :param ZooKeeper zk: A ZooKeeper object. + :param Provider provider: A config Provider object. + :param Label label: The Label object for this node type. + :param ProviderManager provider_manager: The manager object used to + interact with the selected provider. + :param Node node: The node object. + :param int retries: Number of times to retry failed launches. + ''' + threading.Thread.__init__(self, name="NodeLauncher-%s" % node.id) + self.log = logging.getLogger("nodepool.NodeLauncher-%s" % node.id) self._zk = zk + self._provider = provider + self._label = label + self._manager = provider_manager self._node = node self._retries = retries def _launchNode(self): - # TODO(Shrews): Use self._retries here - pass + config_image = self._provider.images[self._label.image] + + cloud_image = self._zk.getMostRecentImageUpload( + config_image.name, self._provider.name) + if not cloud_image: + raise LaunchNodepoolException( + "Unable to find current cloud image %s in %s" % + (config_image.name, self._provider.name) + ) + + hostname = self._provider.hostname_format.format( + label=self._label, provider=self._provider, node=self._node + ) + + self.log.info("Creating server with hostname %s in %s from image %s " + "for node id: %s" % (hostname, self._provider.name, + config_image.name, self._node.id)) + + server = self._manager.createServer( + hostname, + config_image.min_ram, + cloud_image.external_id, + name_filter=config_image.name_filter, + az=self._node.az, + config_drive=config_image.config_drive, + nodepool_node_id=self._node.id, + nodepool_image_name=config_image.name) + + self._node.external_id = server.id + self._node.hostname = hostname + + # Checkpoint save the updated node info + self._zk.storeNode(self._node) + + self.log.debug("Waiting for server %s for node id: %s" % + (server.id, self._node.id)) + server = self._manager.waitForServer( + server, self._provider.launch_timeout) + + if server.status != 'ACTIVE': + raise LaunchStatusException("Server %s for node id: %s " + "status: %s" % + (server.id, self._node.id, + server.status)) + + self._node.public_ipv4 = server.public_v4 + self._node.public_ipv6 = server.public_v6 + + preferred_ip = server.public_v4 + if self._provider.ipv6_preferred: + if server.public_v6: + preferred_ip = server.public_v6 + else: + self.log.warning('Preferred ipv6 not 
available, ' + 'falling back to ipv4.') + if not preferred_ip: + self.log.debug( + "Server data for failed IP: %s" % pprint.pformat( + server)) + raise LaunchNetworkException("Unable to find public IP of server") + + self._node.private_ipv4 = server.private_v4 + # devstack-gate multi-node depends on private_v4 being populated + # with something. On clouds that don't have a private address, use + # the public. + if not self._node.private_ipv4: + self._node.private_ipv4 = server.public_v4 + + # Checkpoint save the updated node info + self._zk.storeNode(self._node) + + self.log.debug("Node id: %s is running, ipv4: %s, ipv6: %s" % + (self._node.id, self._node.public_ipv4, + self._node.public_ipv6)) + + self.log.debug("Node id: %s testing ssh at ip: %s" % + (self._node.id, preferred_ip)) + host = utils.ssh_connect( + preferred_ip, config_image.username, + connect_kwargs=dict(key_filename=config_image.private_key), + timeout=self._provider.boot_timeout) + if not host: + raise LaunchAuthException("Unable to connect via ssh") + + self._writeNodepoolInfo(host, preferred_ip, self._node) + if self._label.ready_script: + self.runReadyScript(host, hostname, self._label.ready_script) + + def _writeNodepoolInfo(self, host, preferred_ip, node): + key = paramiko.RSAKey.generate(2048) + public_key = key.get_name() + ' ' + key.get_base64() + host.ssh("test for config dir", "ls /etc/nodepool") + + ftp = host.client.open_sftp() + + # The IP of this node + f = ftp.open('/etc/nodepool/node', 'w') + f.write(preferred_ip + '\n') + f.close() + # The private IP of this node + f = ftp.open('/etc/nodepool/node_private', 'w') + f.write(node.private_ipv4 + '\n') + f.close() + # The SSH key for this node set + f = ftp.open('/etc/nodepool/id_rsa', 'w') + key.write_private_key(f) + f.close() + f = ftp.open('/etc/nodepool/id_rsa.pub', 'w') + f.write(public_key + '\n') + f.close() + # Provider information for this node set + f = ftp.open('/etc/nodepool/provider', 'w') + f.write('NODEPOOL_PROVIDER=%s\n' % self._provider.name) + f.write('NODEPOOL_CLOUD=%s\n' % self._provider.cloud_config.name) + f.write('NODEPOOL_REGION=%s\n' % ( + self._provider.region_name or '',)) + f.write('NODEPOOL_AZ=%s\n' % (node.az or '',)) + f.close() + # The instance UUID for this node + f = ftp.open('/etc/nodepool/uuid', 'w') + f.write(node.external_id + '\n') + f.close() + + ftp.close() + + def _runReadyScript(self, host, hostname, script): + env_vars = '' + for k, v in os.environ.items(): + if k.startswith('NODEPOOL_'): + env_vars += ' %s="%s"' % (k, v) + host.ssh("run ready script", + "cd /opt/nodepool-scripts && %s ./%s %s" % + (env_vars, script, hostname), + output=True) + + def _run(self): + attempts = 1 + while attempts <= self._retries: + try: + self._launchNode() + break + except Exception: + self.log.exception("Launch attempt %d/%d failed for node %s:", + attempts, self._retries, self._node.id) + if attempts == self._retries: + raise + attempts += 1 + + self._node.state = zk.READY + self._zk.storeNode(self._node) + self.log.info("Node id %s is ready", self._node.id) def run(self): try: @@ -689,23 +854,31 @@ class NodeLauncher(threading.Thread): self._node.state = zk.FAILED self._zk.storeNode(self._node) - def _run(self): - self._launchNode() - self._node.state = zk.READY - self._zk.storeNode(self._node) - class NodeLaunchManager(object): ''' Handle launching multiple nodes in parallel. 
''' - def __init__(self, zk, retries): - self._zk = zk + def __init__(self, zk, provider, labels, provider_manager, retries): + ''' + Initialize the launch manager. + + :param ZooKeeper zk: A ZooKeeper object. + :param Provider provider: A config Provider object. + :param dict labels: A dict of config Label objects. + :param ProviderManager provider_manager: The manager object used to + interact with the selected provider. + :param int retries: Number of times to retry failed launches. + ''' self._retries = retries self._nodes = [] self._failed_nodes = [] self._ready_nodes = [] self._threads = [] + self._zk = zk + self._provider = provider + self._labels = labels + self._manager = provider_manager @property def alive_thread_count(self): @@ -734,7 +907,9 @@ class NodeLaunchManager(object): :param Node node: The node object. ''' self._nodes.append(node) - t = NodeLauncher(self._zk, node, self._retries) + label = self._labels[node.type] + t = NodeLauncher(self._zk, self._provider, label, self._manager, + node, self._retries) t.start() self._threads.append(t) @@ -784,6 +959,7 @@ class NodeRequestHandler(object): self.log = logging.getLogger("nodepool.NodeRequestHandler") self.provider = pw.provider self.zk = pw.zk + self.labels = pw.labels self.manager = pw.manager self.launcher_id = pw.launcher_id self.request = request @@ -800,7 +976,13 @@ class NodeRequestHandler(object): :returns: True if it is available, False otherwise. ''' - for img in self.request.node_types: + for label in self.request.node_types: + try: + img = self.labels[label].image + except KeyError: + self.log.error("Node type %s not a defined label", label) + return False + if not self.zk.getMostRecentImageUpload(img, self.provider.name): return False return True @@ -841,26 +1023,6 @@ class NodeRequestHandler(object): except Exception: self.log.exception("Error unlocking node:") - def _getReadyNodesOfTypes(self, ntypes): - ''' - Query ZooKeeper for unused/ready nodes. - - :param str ntypes: The node types we want. - - :returns: A dictionary, keyed by node type, with lists of Node objects - that are ready, or an empty dict if none are found. - ''' - ret = {} - for node_id in self.zk.getNodes(): - node = self.zk.getNode(node_id) - if (node and node.state == zk.READY and - not node.allocated_to and node.type in ntypes - ): - if node.type not in ret: - ret[node.type] = [] - ret[node.type].append(node) - return ret - def _run(self): ''' Main body for the NodeRequestHandler. @@ -880,16 +1042,17 @@ class NodeRequestHandler(object): if launchers.issubset(set(self.request.declined_by)): # All launchers have declined it self.request.state = zk.FAILED - self.zk.updateNodeRequest(self.request) + self.zk.storeNodeRequest(self.request) self.zk.unlockNodeRequest(self.request) self.done = True return self.request.state = zk.PENDING - self.zk.updateNodeRequest(self.request) + self.zk.storeNodeRequest(self.request) - self.launch_manager = NodeLaunchManager(self.zk, retries=3) - ready_nodes = self._getReadyNodesOfTypes(self.request.node_types) + self.launch_manager = NodeLaunchManager( + self.zk, self.provider, self.labels, self.manager, retries=3) + ready_nodes = self.zk.getReadyNodesOfTypes(self.request.node_types) for ntype in self.request.node_types: # First try to grab from the list of already available nodes. 
@@ -917,7 +1080,7 @@ class NodeRequestHandler(object): node.allocated_to = self.request.id # Note: It should be safe (i.e., no race) to lock the node - # *after* it is stored since nodes in BUILDING state are not + # *after* it is stored since nodes in INIT state are not # locked anywhere. self.zk.storeNode(node) self.zk.lockNode(node, blocking=False) @@ -941,7 +1104,7 @@ class NodeRequestHandler(object): except Exception: self.log.exception("Exception in NodeRequestHandler:") self.request.state = zk.FAILED - self.zk.updateNodeRequest(self.request) + self.zk.storeNodeRequest(self.request) self.zk.unlockNodeRequest(self.request) self.done = True @@ -988,7 +1151,7 @@ class NodeRequestHandler(object): self.request.state = zk.FULFILLED self._unlockNodeSet() - self.zk.updateNodeRequest(self.request) + self.zk.storeNodeRequest(self.request) self.zk.unlockNodeRequest(self.request) return True @@ -1017,6 +1180,7 @@ class ProviderWorker(threading.Thread): # These attributes will be used by NodeRequestHandler self.zk = zk self.manager = None + self.labels = None self.provider = provider self.launcher_id = "%s-%s-%s" % (socket.gethostname(), os.getpid(), @@ -1035,6 +1199,7 @@ class ProviderWorker(threading.Thread): this thread to terminate. ''' config = nodepool_config.loadConfig(self.configfile) + self.labels = config.labels if self.provider.name not in config.providers.keys(): self.log.info("Provider %s removed from config" @@ -1182,6 +1347,7 @@ class NodePool(threading.Thread): self._instance_delete_threads = {} self._instance_delete_threads_lock = threading.Lock() self._wake_condition = threading.Condition() + self._submittedRequests = {} def stop(self): self._stopped = True @@ -1486,9 +1652,82 @@ class NodePool(threading.Thread): def updateConfig(self): config = self.loadConfig() self.reconfigureZooKeeper(config) - self.reconfigureCrons(config) self.setConfig(config) + def removeCompletedRequests(self): + ''' + Remove (locally and in ZK) fulfilled node requests. + + We also must reset the allocated_to attribute for each Node assigned + to our request, since we are deleting the request. + ''' + for label in self._submittedRequests.keys(): + req = self._submittedRequests[label] + self._submittedRequests[label] = self.zk.getNodeRequest(req.id) + + if self._submittedRequests[label]: + if self._submittedRequests[label].state == zk.FULFILLED: + self.log.debug("min-ready node request for %s fulfilled", label) + # Reset node allocated_to + for node_id in self._submittedRequests[label].nodes: + node = self.zk.getNode(node_id) + node.allocated_to = None + # NOTE: locking shouldn't be necessary since a node with + # allocated_to set should not be locked except by the + # creator of the request (us). + self.zk.storeNode(node) + self.zk.deleteNodeRequest(self._submittedRequests[label]) + del self._submittedRequests[label] + elif self._submittedRequests[label].state == zk.FAILED: + self.log.debug("min-ready node request for %s failed", label) + self.zk.deleteNodeRequest(self._submittedRequests[label]) + del self._submittedRequests[label] + + def createMinReady(self): + ''' + Create node requests to make the minimum amount of ready nodes. + + Since this method will be called repeatedly, we need to take care to + note when we have already submitted node requests to satisfy min-ready. + Requests we've already submitted are stored in the _submittedRequests + dict, keyed by label. 
+ ''' + def createRequest(label_name, count): + req = zk.NodeRequest() + req.state = zk.REQUESTED + for i in range(0, count): + req.node_types.append(label_name) + self.zk.storeNodeRequest(req) + self._submittedRequests[label_name] = req + + # Since we could have already submitted node requests, do not + # resubmit a request for a type if a request for that type is + # still in progress. + self.removeCompletedRequests() + label_names = self.config.labels.keys() + requested_labels = self._submittedRequests.keys() + needed_labels = list(set(label_names) - set(requested_labels)) + + ready_nodes = self.zk.getReadyNodesOfTypes(needed_labels) + + for label in self.config.labels.values(): + if label.name not in needed_labels: + continue + min_ready = label.min_ready + if min_ready == -1: + continue # disabled + + # Calculate how many nodes of this type we need created + need = 0 + if label.name not in ready_nodes.keys(): + need = label.min_ready + elif len(ready_nodes[label.name]) < min_ready: + need = min_ready - len(ready_nodes[label.name]) + + if need: + self.log.info("Creating request for %d %s nodes", need, label.name) + createRequest(label.name, need) + def run(self): ''' Start point for the NodePool thread. @@ -1502,6 +1741,8 @@ class NodePool(threading.Thread): self.log.info("ZooKeeper suspended. Waiting") time.sleep(SUSPEND_WAIT_TIME) + self.createMinReady() + # Start (or restart) provider threads for each provider in # the config. Removing a provider from the config and then # adding it back would cause a restart. diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 89047ed17..c8b369ea6 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -188,6 +188,10 @@ class BaseTestCase(testtools.TestCase): continue if t.name.startswith("CleanupWorker"): continue + if t.name.startswith("ProviderWorker"): + continue + if t.name.startswith("NodeLauncher"): + continue if t.name not in whitelist: done = False if done: @@ -430,17 +434,6 @@ class DBTestCase(BaseTestCase): time.sleep(1) self.wait_for_threads() - def submitNodeRequest(self, req): - ''' - Very simple submit of a node request to ZooKeeper. - ''' - priority = 100 - req.state = zk.REQUESTED - path = '%s/%s-' % (self.zk.REQUEST_ROOT, priority) - path = self.zk.client.create(path, req.serialize(), makepath=True, - sequence=True, ephemeral=True) - req.id = path.split("/")[-1] - def waitForNodeRequest(self, req): ''' Wait for a node request to transition to a final state. diff --git a/nodepool/tests/test_nodelaunchmanager.py b/nodepool/tests/test_nodelaunchmanager.py index db87121cf..3236eed46 100644 --- a/nodepool/tests/test_nodelaunchmanager.py +++ b/nodepool/tests/test_nodelaunchmanager.py @@ -17,6 +17,8 @@ import logging import mock import time +from nodepool import builder +from nodepool import provider_manager from nodepool import tests from nodepool import zk from nodepool.nodepool import NodeLaunchManager @@ -25,10 +27,34 @@ from nodepool.nodepool import NodeLaunchManager class TestNodeLaunchManager(tests.DBTestCase): log = logging.getLogger("nodepool.TestNodeLaunchManager") + def _setup(self, configfile): + # Need a builder for the launch code to work and to access + # config objects. 
+ b = builder.NodePoolBuilder(configfile) + b.cleanup_interval = .5 + b.build_interval = .1 + b.upload_interval = .1 + b.dib_cmd = 'nodepool/tests/fake-image-create' + b.start() + self.addCleanup(b.stop) + self.waitForImage('fake-provider', 'fake-image') + + self.provider = b._config.providers['fake-provider'] + self.labels = b._config.labels + + # The builder config does not have a provider manager, so create one. + self.pmanager = provider_manager.ProviderManager(self.provider, False) + self.pmanager.resetClient() + def test_successful_launch(self): + configfile = self.setup_config('node.yaml') + self._setup(configfile) + n1 = zk.Node() n1.state = zk.BUILDING - mgr = NodeLaunchManager(self.zk, 0) + n1.type = 'fake-label' + mgr = NodeLaunchManager(self.zk, self.provider, self.labels, + self.pmanager, 1) mgr.launch(n1) while not mgr.poll(): time.sleep(0) @@ -37,10 +63,15 @@ class TestNodeLaunchManager(tests.DBTestCase): @mock.patch('nodepool.nodepool.NodeLauncher._launchNode') def test_failed_launch(self, mock_launch): + configfile = self.setup_config('node.yaml') + self._setup(configfile) + mock_launch.side_effect = Exception() n1 = zk.Node() n1.state = zk.BUILDING - mgr = NodeLaunchManager(self.zk, 0) + n1.type = 'fake-label' + mgr = NodeLaunchManager(self.zk, self.provider, self.labels, + self.pmanager, 1) mgr.launch(n1) while not mgr.poll(): time.sleep(0) @@ -49,12 +80,18 @@ class TestNodeLaunchManager(tests.DBTestCase): @mock.patch('nodepool.nodepool.NodeLauncher._launchNode') def test_mixed_launch(self, mock_launch): + configfile = self.setup_config('node.yaml') + self._setup(configfile) + mock_launch.side_effect = [None, Exception()] n1 = zk.Node() n1.state = zk.BUILDING + n1.type = 'fake-label' n2 = zk.Node() n2.state = zk.BUILDING - mgr = NodeLaunchManager(self.zk, 0) + n2.type = 'fake-label' + mgr = NodeLaunchManager(self.zk, self.provider, self.labels, + self.pmanager, 1) mgr.launch(n1) mgr.launch(n2) while not mgr.poll(): diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index c614a9cbd..8feecfd72 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -44,9 +44,9 @@ class TestNodepool(tests.DBTestCase): pool.start() req = zk.NodeRequest() - req.node_types.append('fake-image') - self.submitNodeRequest(req) - self.assertEqual(req.state, zk.REQUESTED) + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) req = self.waitForNodeRequest(req) self.assertEqual(req.state, zk.FULFILLED) @@ -75,9 +75,9 @@ class TestNodepool(tests.DBTestCase): pool.start() req = zk.NodeRequest() - req.node_types.append('fake-image') - self.submitNodeRequest(req) - self.assertEqual(req.state, zk.REQUESTED) + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) req = self.waitForNodeRequest(req) self.assertTrue(mock_launch.called) @@ -93,9 +93,9 @@ class TestNodepool(tests.DBTestCase): pool.start() req = zk.NodeRequest() + req.state = zk.REQUESTED req.node_types.append("zorky-zumba") - self.submitNodeRequest(req) - self.assertEqual(req.state, zk.REQUESTED) + self.zk.storeNodeRequest(req) req = self.waitForNodeRequest(req) self.assertEqual(req.state, zk.FAILED) diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 8bdf33ab6..edecf2c5d 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -523,6 +523,54 @@ class TestZooKeeper(tests.DBTestCase): node2 = self.zk.getNode(node.id) self.assertEqual(node, node2) + def 
_create_node_request(self): + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('label1') + self.zk.storeNodeRequest(req) + self.assertIsNotNone( + self.zk.client.exists(self.zk._requestPath(req.id)) + ) + return req + + def test_storeNodeRequest(self): + req = self._create_node_request() + req2 = self.zk.getNodeRequest(req.id) + self.assertEqual(req, req2) + + def test_storeNodeRequest_update(self): + req = self._create_node_request() + req.state = zk.FULFILLED + self.zk.storeNodeRequest(req) + self.assertIsNotNone(req.id) + req2 = self.zk.getNodeRequest(req.id) + self.assertEqual(req, req2) + + def test_deleteNodeRequest(self): + req = self._create_node_request() + self.zk.deleteNodeRequest(req) + self.assertIsNone( + self.zk.client.exists(self.zk._requestPath(req.id)) + ) + + def test_getReadyNodesOfTypes(self): + n1 = self._create_node() + n1.type = 'label1' + self.zk.storeNode(n1) + n2 = self._create_node() + n2.state = zk.READY + n2.type = 'label1' + self.zk.storeNode(n2) + n3 = self._create_node() + n3.state = zk.READY + n3.type = 'label2' + self.zk.storeNode(n3) + + r = self.zk.getReadyNodesOfTypes(['label1']) + self.assertIn('label1', r) + self.assertEqual(1, len(r['label1'])) + self.assertEqual(n2, r['label1'][0]) + class TestZKModel(tests.BaseTestCase): @@ -658,6 +706,8 @@ class TestZKModel(tests.BaseTestCase): o.public_ipv6 = '' o.image_id = 'image-id' o.launcher = 'launcher-id' + o.external_id = 'ABCD' + o.hostname = 'xyz' d = o.toDict() self.assertNotIn('id', d) @@ -673,6 +723,8 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(d['public_ipv6'], o.public_ipv6) self.assertEqual(d['image_id'], o.image_id) self.assertEqual(d['launcher'], o.launcher) + self.assertEqual(d['external_id'], o.external_id) + self.assertEqual(d['hostname'], o.hostname) def test_Node_fromDict(self): now = int(time.time()) @@ -690,6 +742,8 @@ class TestZKModel(tests.BaseTestCase): 'public_ipv6': '', 'image_id': 'image-id', 'launcher': 'launcher-id', + 'external_id': 'ABCD', + 'hostname': 'xyz', } o = zk.Node.fromDict(d, node_id) @@ -706,3 +760,5 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(o.public_ipv6, d['public_ipv6']) self.assertEqual(o.image_id, d['image_id']) self.assertEqual(o.launcher, d['launcher']) + self.assertEqual(o.external_id, d['external_id']) + self.assertEqual(o.hostname , d['hostname']) diff --git a/nodepool/zk.py b/nodepool/zk.py index 0a7e2e398..f71fcd1a6 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -327,6 +327,15 @@ class NodeRequest(BaseModel): d['stat'] = self.stat return '' % d + def __eq__(self, other): + if isinstance(other, NodeRequest): + return (self.id == other.id and + self.declined_by == other.declined_by and + self.node_types == other.node_types and + self.nodes == other.nodes) + else: + return False + def toDict(self): ''' Convert a NodeRequest object's attributes to a dictionary. 
@@ -375,6 +384,8 @@ class Node(BaseModel): self.image_id = None self.launcher = None self.created_time = None + self.external_id = None + self.hostname = None def __repr__(self): d = self.toDict() @@ -396,7 +407,9 @@ class Node(BaseModel): self.public_ipv6 == other.public_ipv6 and self.image_id == other.image_id and self.launcher == other.launcher and - self.created_time == other.created_time) + self.created_time == other.created_time and + self.external_id == other.external_id and + self.hostname == other.hostname) else: return False @@ -415,6 +428,8 @@ class Node(BaseModel): d['image_id'] = self.image_id d['launcher'] = self.launcher d['created_time'] = self.created_time + d['external_id'] = self.external_id + d['hostname'] = self.hostname return d @staticmethod @@ -439,6 +454,8 @@ class Node(BaseModel): o.image_id = d.get('image_id') o.launcher = d.get('launcher') o.created_time = d.get('created_time') + o.external_id = d.get('external_id') + o.hostname = d.get('hostname') return o @@ -1233,24 +1250,45 @@ class ZooKeeper(object): d.stat = stat return d - def updateNodeRequest(self, request): + def storeNodeRequest(self, request, priority="100"): ''' - Update a node request. - - The request must already be locked before updating. + Store a new or existing node request. :param NodeRequest request: The node request to update. + :param str priority: Priority of a new request. Ignored on updates. ''' - if request.lock is None: - raise Exception("%s must be locked before updating." % request) + if not request.id: + path = "%s/%s-" % (self.REQUEST_ROOT, priority) + path = self.client.create( + path, + value=request.serialize(), + sequence=True, + makepath=True) + request.id = path.split("/")[-1] # Validate it still exists before updating - if not self.getNodeRequest(request.id): - raise Exception( - "Attempt to update non-existing request %s" % request) + else: + if not self.getNodeRequest(request.id): + raise Exception( + "Attempt to update non-existing request %s" % request) + + path = self._requestPath(request.id) + self.client.set(path, request.serialize()) + + def deleteNodeRequest(self, request): + ''' + Delete a node request. + + :param NodeRequest request: The request to delete. + ''' + if not request.id: + return path = self._requestPath(request.id) - self.client.set(path, request.serialize()) + try: + self.client.delete(path) + except kze.NoNodeError: + pass def lockNodeRequest(self, request, blocking=True, timeout=None): ''' @@ -1406,3 +1444,23 @@ class ZooKeeper(object): else: path = self._nodePath(node.id) self.client.set(path, node.serialize()) + + def getReadyNodesOfTypes(self, labels): + ''' + Query ZooKeeper for unused/ready nodes. + + :param list labels: The node types we want. + + :returns: A dictionary, keyed by node type, with lists of Node objects + that are ready, or an empty dict if none are found. + ''' + ret = {} + for node_id in self.getNodes(): + node = self.getNode(node_id) + if (node and node.state == READY and + not node.allocated_to and node.type in labels + ): + if node.type not in ret: + ret[node.type] = [] + ret[node.type].append(node) + return ret From 672612deaa9fac7e85e783f37a9ce174995c991f Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 9 Feb 2017 12:18:28 -0500 Subject: [PATCH 038/309] Update nodepool 'list' command Enables the nodepool 'list' command to speak ZooKeeper. Re-enables the test_node_list test as well. Needed to individually skip failing tests in test_commands.py. 
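Concretely, the command layer now hands its ZooKeeper client straight to the status module instead of a database session; a minimal sketch of the new call path (names taken from the diff below):

    from nodepool import status

    def print_node_table(zk_conn, node_id=None):
        # node_list() now reads Node records out of ZooKeeper and renders
        # them with PrettyTable; passing a node_id limits output to one row.
        print(status.node_list(zk_conn, node_id))
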
Adds 'comment' attribute to the Node model since this is output by the 'list' command. Update waitForNodes() to use zookeeper syntax. Change-Id: I61a92470054985c974f3c20d5be358b399925795 Signed-off-by: Paul Belanger --- nodepool/cmd/nodepoolcmd.py | 5 +++-- nodepool/status.py | 34 +++++++++++++++++---------------- nodepool/tests/__init__.py | 15 +++++---------- nodepool/tests/test_commands.py | 16 +++++++++++++--- nodepool/tests/test_zk.py | 4 ++++ nodepool/zk.py | 6 +++++- 6 files changed, 48 insertions(+), 32 deletions(-) diff --git a/nodepool/cmd/nodepoolcmd.py b/nodepool/cmd/nodepoolcmd.py index 5ffe1918b..c92a422d5 100644 --- a/nodepool/cmd/nodepoolcmd.py +++ b/nodepool/cmd/nodepoolcmd.py @@ -159,7 +159,7 @@ class NodePoolCmd(NodepoolApp): l.setLevel(logging.WARNING) def list(self, node_id=None): - print status.node_list(self.pool.getDB(), node_id) + print status.node_list(self.zk, node_id) def dib_image_list(self): print status.dib_image_list(self.zk) @@ -354,7 +354,8 @@ class NodePoolCmd(NodepoolApp): # commands needing ZooKeeper if self.args.command in ('image-build', 'dib-image-list', 'image-list', 'dib-image-delete', - 'image-delete', 'alien-image-list'): + 'image-delete', 'alien-image-list', + 'list'): self.zk = zk.ZooKeeper() self.zk.connect(config.zookeeper_servers.values()) else: diff --git a/nodepool/status.py b/nodepool/status.py index 1283d358f..934d42398 100644 --- a/nodepool/status.py +++ b/nodepool/status.py @@ -17,8 +17,6 @@ import json import time -from nodepool import nodedb - from prettytable import PrettyTable @@ -31,21 +29,25 @@ def age(timestamp): return '%02d:%02d:%02d:%02d' % (d, h, m, s) -def node_list(db, node_id=None): - t = PrettyTable(["ID", "Provider", "AZ", "Label", "Target", - "Manager", "Hostname", "NodeName", "Server ID", - "IP", "State", "Age", "Comment"]) +def node_list(zk, node_id=None): + t = PrettyTable(["ID", "Provider", "AZ", "Label", + "Launcher", "Hostname", "Server ID", + "Public IPv4", "Private IPv4", "IPv6", + "State", "Age", "Comment"]) t.align = 'l' - with db.getSession() as session: - for node in session.getNodes(): - if node_id and node.id != node_id: - continue - t.add_row([node.id, node.provider_name, node.az, - node.label_name, node.target_name, - node.manager_name, node.hostname, - node.nodename, node.external_id, node.ip, - nodedb.STATE_NAMES[node.state], - age(node.state_time), node.comment]) + if node_id: + node = zk.getNode(node_id) + t.add_row([node.id, node.provider, node.az, node.type, + node.launcher, node.hostname, node.external_id, + node.public_ipv4, node.private_ipv4, node.public_ipv6, + node.state, age(node.state_time), node.comment]) + else: + for nid in zk.getNodes(): + node = zk.getNode(nid) + t.add_row([node.id, node.provider, node.az, node.type, + node.launcher, node.hostname, node.external_id, + node.public_ipv4, node.private_ipv4, node.public_ipv6, + node.state, age(node.state_time), node.comment]) return str(t) diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index c8b369ea6..c0c4ef35f 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -32,7 +32,7 @@ import lockfile import kazoo.client import testtools -from nodepool import allocation, builder, fakeprovider, nodepool, nodedb, webapp +from nodepool import builder, fakeprovider, nodepool, webapp from nodepool import zk from nodepool.cmd.config_validator import ConfigValidator @@ -420,17 +420,12 @@ class DBTestCase(BaseTestCase): self.wait_for_threads() - def waitForNodes(self, pool): - self.wait_for_config(pool) - 
allocation_history = allocation.AllocationHistory() + def waitForNodes(self, label): while True: self.wait_for_threads() - with pool.getDB().getSession() as session: - needed = pool.getNeededNodes(session, allocation_history) - if not needed: - nodes = session.getNodes(state=nodedb.BUILDING) - if not nodes: - break + ready_nodes = self.zk.getReadyNodesOfTypes([label]) + if ready_nodes: + break time.sleep(1) self.wait_for_threads() diff --git a/nodepool/tests/test_commands.py b/nodepool/tests/test_commands.py index 2100d71ae..43f5a1a3a 100644 --- a/nodepool/tests/test_commands.py +++ b/nodepool/tests/test_commands.py @@ -21,6 +21,8 @@ import fixtures import mock import testtools +from unittest import skip + from nodepool.cmd import nodepoolcmd from nodepool import tests from nodepool import zk @@ -28,8 +30,7 @@ from nodepool import zk class TestNodepoolCMD(tests.DBTestCase): def setUp(self): - super(tests.DBTestCase, self).setUp() - self.skipTest("Disabled for early v3 development") + super(TestNodepoolCMD, self).setUp() def patch_argv(self, *args): argv = ["nodepool", "-s", self.secure_conf] @@ -88,6 +89,7 @@ class TestNodepoolCMD(tests.DBTestCase): self.waitForUploadRecordDeletion('fake-provider', 'fake-image', image.build_id, image.id) + @skip("Disabled for early v3 development") def test_alien_list_fail(self): def fail_list(self): raise RuntimeError('Fake list error') @@ -124,7 +126,7 @@ class TestNodepoolCMD(tests.DBTestCase): pool = self.useNodepool(configfile, watermark_sleep=1) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) + self.waitForNodes('fake-label') self.assert_nodes_listed(configfile, 1) def test_config_validate(self): @@ -147,6 +149,7 @@ class TestNodepoolCMD(tests.DBTestCase): nodepoolcmd.main() self.assert_listed(configfile, ['dib-image-list'], 1, 'fake-image', 0) + @skip("Disabled for early v3 development") def test_dib_image_pause(self): configfile = self.setup_config('node_diskimage_pause.yaml') self._useBuilder(configfile) @@ -156,6 +159,7 @@ class TestNodepoolCMD(tests.DBTestCase): self.assert_listed(configfile, ['dib-image-list'], 1, 'fake-image', 0) self.assert_listed(configfile, ['dib-image-list'], 1, 'fake-image2', 1) + @skip("Disabled for early v3 development") def test_dib_image_upload_pause(self): configfile = self.setup_config('node_image_upload_pause.yaml') self._useBuilder(configfile) @@ -169,6 +173,7 @@ class TestNodepoolCMD(tests.DBTestCase): self.assert_listed(configfile, ['image-list'], 3, 'fake-image', 0) self.assert_listed(configfile, ['image-list'], 3, 'fake-image2', 1) + @skip("Disabled for early v3 development") def test_dib_image_delete(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) @@ -188,6 +193,7 @@ class TestNodepoolCMD(tests.DBTestCase): self.assert_listed( configfile, ['dib-image-list'], 0, 'fake-image-0000000001', 0) + @skip("Disabled for early v3 development") def test_hold(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) @@ -205,6 +211,7 @@ class TestNodepoolCMD(tests.DBTestCase): self.assert_listed(configfile, ['list'], 0, 1, 1) self.assert_nodes_listed(configfile, 1, 'hold') + @skip("Disabled for early v3 development") def test_delete(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) @@ -218,6 +225,7 @@ class TestNodepoolCMD(tests.DBTestCase): # Delete node 1 self.assert_listed(configfile, ['delete', '1'], 10, 'delete', 1) + 
@skip("Disabled for early v3 development") def test_delete_now(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) @@ -250,6 +258,7 @@ class TestNodepoolCMD(tests.DBTestCase): self.waitForImage('fake-provider', 'fake-image', [image]) self.assert_listed(configfile, ['dib-image-list'], 4, zk.READY, 2) + @skip("Disabled for early v3 development") def test_job_create(self): configfile = self.setup_config('node.yaml') self.patch_argv("-c", configfile, "job-create", "fake-job", @@ -257,6 +266,7 @@ class TestNodepoolCMD(tests.DBTestCase): nodepoolcmd.main() self.assert_listed(configfile, ['job-list'], 2, 1, 1) + @skip("Disabled for early v3 development") def test_job_delete(self): configfile = self.setup_config('node.yaml') self.patch_argv("-c", configfile, "job-create", "fake-job", diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index edecf2c5d..2ff2205a0 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -708,6 +708,7 @@ class TestZKModel(tests.BaseTestCase): o.launcher = 'launcher-id' o.external_id = 'ABCD' o.hostname = 'xyz' + o.comment = 'comment' d = o.toDict() self.assertNotIn('id', d) @@ -725,6 +726,7 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(d['launcher'], o.launcher) self.assertEqual(d['external_id'], o.external_id) self.assertEqual(d['hostname'], o.hostname) + self.assertEqual(d['comment'], o.comment) def test_Node_fromDict(self): now = int(time.time()) @@ -744,6 +746,7 @@ class TestZKModel(tests.BaseTestCase): 'launcher': 'launcher-id', 'external_id': 'ABCD', 'hostname': 'xyz', + 'comment': 'comment', } o = zk.Node.fromDict(d, node_id) @@ -762,3 +765,4 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(o.launcher, d['launcher']) self.assertEqual(o.external_id, d['external_id']) self.assertEqual(o.hostname , d['hostname']) + self.assertEqual(o.comment , d['comment']) diff --git a/nodepool/zk.py b/nodepool/zk.py index f71fcd1a6..93b6fc921 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -386,6 +386,7 @@ class Node(BaseModel): self.created_time = None self.external_id = None self.hostname = None + self.comment = None def __repr__(self): d = self.toDict() @@ -409,7 +410,8 @@ class Node(BaseModel): self.launcher == other.launcher and self.created_time == other.created_time and self.external_id == other.external_id and - self.hostname == other.hostname) + self.hostname == other.hostname, + self.comment == other.comment) else: return False @@ -430,6 +432,7 @@ class Node(BaseModel): d['created_time'] = self.created_time d['external_id'] = self.external_id d['hostname'] = self.hostname + d['comment'] = self.comment return d @staticmethod @@ -456,6 +459,7 @@ class Node(BaseModel): o.created_time = d.get('created_time') o.external_id = d.get('external_id') o.hostname = d.get('hostname') + o.comment = d.get('comment') return o From 5592d6a2b3ab097db73d7b52cff7ee17636da53d Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Thu, 9 Feb 2017 17:08:03 -0500 Subject: [PATCH 039/309] Update nodepool hold to use zookeeper Replace database calls with new zookeeper APIs. Also enable our testing. 
Change-Id: Id6c07ed5b61e54afadc1b2787f4c8f4744ce33ff Signed-off-by: Paul Belanger --- nodepool/cmd/nodepoolcmd.py | 21 ++++++++++----------- nodepool/tests/__init__.py | 2 ++ nodepool/tests/test_commands.py | 13 +++++++------ 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/nodepool/cmd/nodepoolcmd.py b/nodepool/cmd/nodepoolcmd.py index c92a422d5..375d47e69 100644 --- a/nodepool/cmd/nodepoolcmd.py +++ b/nodepool/cmd/nodepoolcmd.py @@ -89,8 +89,8 @@ class NodePoolCmd(NodepoolApp): help='place a node in the HOLD state') cmd_hold.set_defaults(func=self.hold) cmd_hold.add_argument('id', help='node id') - cmd_hold.add_argument('--reason', - help='Optional reason this node is held') + cmd_hold.add_argument('--reason', help='Reason this node is held', + required=True) cmd_delete = subparsers.add_parser( 'delete', @@ -252,14 +252,13 @@ class NodePoolCmd(NodepoolApp): print t def hold(self): - node_id = None - with self.pool.getDB().getSession() as session: - node = session.getNode(self.args.id) - node.state = nodedb.HOLD - if self.args.reason: - node.comment = self.args.reason - node_id = node.id - self.list(node_id=node_id) + node = self.zk.getNode(self.args.id) + node.state = zk.HOLD + node.comment = self.args.reason + self.zk.lockNode(node, blocking=False) + self.zk.storeNode(node) + self.zk.unlockNode(node) + self.list(node_id=self.args.id) def delete(self): if self.args.now: @@ -355,7 +354,7 @@ class NodePoolCmd(NodepoolApp): if self.args.command in ('image-build', 'dib-image-list', 'image-list', 'dib-image-delete', 'image-delete', 'alien-image-list', - 'list'): + 'list', 'hold'): self.zk = zk.ZooKeeper() self.zk.connect(config.zookeeper_servers.values()) else: diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index c0c4ef35f..191d5cfb6 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -428,6 +428,7 @@ class DBTestCase(BaseTestCase): break time.sleep(1) self.wait_for_threads() + return ready_nodes[label] def waitForNodeRequest(self, req): ''' @@ -438,6 +439,7 @@ class DBTestCase(BaseTestCase): if req.state in (zk.FULFILLED, zk.FAILED): break time.sleep(1) + return req def useNodepool(self, *args, **kwargs): diff --git a/nodepool/tests/test_commands.py b/nodepool/tests/test_commands.py index 43f5a1a3a..fcaf2dc6c 100644 --- a/nodepool/tests/test_commands.py +++ b/nodepool/tests/test_commands.py @@ -193,22 +193,23 @@ class TestNodepoolCMD(tests.DBTestCase): self.assert_listed( configfile, ['dib-image-list'], 0, 'fake-image-0000000001', 0) - @skip("Disabled for early v3 development") def test_hold(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) + nodes = self.waitForNodes('fake-label') + node_id = nodes[0].id # Assert one node exists and it is node 1 in a ready state. 
- self.assert_listed(configfile, ['list'], 0, 1, 1) + self.assert_listed(configfile, ['list'], 0, node_id, 1) self.assert_nodes_listed(configfile, 1, zk.READY) - # Hold node 1 - self.patch_argv('-c', configfile, 'hold', '1') + # Hold node 0000000000 + self.patch_argv( + '-c', configfile, 'hold', node_id, '--reason', 'testing') nodepoolcmd.main() # Assert the state changed to HOLD - self.assert_listed(configfile, ['list'], 0, 1, 1) + self.assert_listed(configfile, ['list'], 0, node_id, 1) self.assert_nodes_listed(configfile, 1, 'hold') @skip("Disabled for early v3 development") From 5e6cf751227855934f4c2c489219feb5486d1704 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 10 Feb 2017 15:32:26 -0500 Subject: [PATCH 040/309] Re-enable test_dib_image_pause / test_dib_image_upload_pause Update tests to use new waitForNodes() syntax. Change-Id: I125c48d9d7b1dbaf98f3b79f30a4fd7aea83b355 Signed-off-by: Paul Belanger --- nodepool/tests/test_commands.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nodepool/tests/test_commands.py b/nodepool/tests/test_commands.py index fcaf2dc6c..cdeb2476f 100644 --- a/nodepool/tests/test_commands.py +++ b/nodepool/tests/test_commands.py @@ -149,23 +149,23 @@ class TestNodepoolCMD(tests.DBTestCase): nodepoolcmd.main() self.assert_listed(configfile, ['dib-image-list'], 1, 'fake-image', 0) - @skip("Disabled for early v3 development") def test_dib_image_pause(self): configfile = self.setup_config('node_diskimage_pause.yaml') self._useBuilder(configfile) pool = self.useNodepool(configfile, watermark_sleep=1) pool.start() - self.waitForNodes(pool) + nodes = self.waitForNodes('fake-label2') + self.assertEqual(len(nodes), 1) self.assert_listed(configfile, ['dib-image-list'], 1, 'fake-image', 0) self.assert_listed(configfile, ['dib-image-list'], 1, 'fake-image2', 1) - @skip("Disabled for early v3 development") def test_dib_image_upload_pause(self): configfile = self.setup_config('node_image_upload_pause.yaml') self._useBuilder(configfile) pool = self.useNodepool(configfile, watermark_sleep=1) pool.start() - self.waitForNodes(pool) + nodes = self.waitForNodes('fake-label2') + self.assertEqual(len(nodes), 1) # Make sure diskimages were built. self.assert_listed(configfile, ['dib-image-list'], 1, 'fake-image', 1) self.assert_listed(configfile, ['dib-image-list'], 1, 'fake-image2', 1) From 06a553918af33423f1ba9b2fb9746954de8ee426 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 11 Feb 2017 11:24:42 -0500 Subject: [PATCH 041/309] Re-enable test_dib_image_delete test Update test to use the new zookeeper syntax for waitForNodes(). 
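For reference, the updated pattern in a test looks roughly like this (hypothetical test name; the helpers are the ones already used throughout these tests):

    from nodepool import tests

    class ExampleTest(tests.DBTestCase):
        def test_example(self):
            configfile = self.setup_config('node.yaml')
            pool = self.useNodepool(configfile, watermark_sleep=1)
            self._useBuilder(configfile)
            pool.start()
            self.waitForImage('fake-provider', 'fake-image')
            # waitForNodes() now takes a label and returns the ready Node
            # objects, instead of taking the pool object.
            nodes = self.waitForNodes('fake-label')
            self.assertEqual(len(nodes), 1)
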
Change-Id: I04780bf3e365693b8b4acfc1cf3af7dd335fe539 Signed-off-by: Paul Belanger --- nodepool/tests/test_commands.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nodepool/tests/test_commands.py b/nodepool/tests/test_commands.py index cdeb2476f..b87014b10 100644 --- a/nodepool/tests/test_commands.py +++ b/nodepool/tests/test_commands.py @@ -173,14 +173,14 @@ class TestNodepoolCMD(tests.DBTestCase): self.assert_listed(configfile, ['image-list'], 3, 'fake-image', 0) self.assert_listed(configfile, ['image-list'], 3, 'fake-image2', 1) - @skip("Disabled for early v3 development") def test_dib_image_delete(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) # Check the image exists self.assert_listed(configfile, ['dib-image-list'], 4, zk.READY, 1) builds = self.zk.getMostRecentBuilds(1, 'fake-image', zk.READY) From c4112ca2a4fdab310590dd740e69e5c7bea04d3c Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 11 Feb 2017 11:30:59 -0500 Subject: [PATCH 042/309] Re-enable test_node test Replace database API with zookeeper. Change-Id: Iba117d130ac02df122b2fd7fdee03171f154577d Signed-off-by: Paul Belanger --- nodepool/tests/test_nodepool.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 8feecfd72..8e1651eee 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -101,7 +101,6 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(req.state, zk.FAILED) self.assertNotEqual(req.declined_by, []) - @skip("Disabled for early v3 development") def test_node(self): """Test that an image and node are created""" configfile = self.setup_config('node.yaml') @@ -109,14 +108,12 @@ class TestNodepool(tests.DBTestCase): self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) + nodes = self.waitForNodes('fake-label') + + self.assertEqual(len(nodes), 1) + self.assertEqual(nodes[0].provider, 'fake-provider') + self.assertEqual(nodes[0].type, 'fake-label') - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) @skip("Disabled for early v3 development") def test_disabled_label(self): From 382295dcc085d4f9203acabdbf5af3246ed3d436 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 13 Feb 2017 10:13:52 -0500 Subject: [PATCH 043/309] Split up min-ready requests to 1 node per request This does two things: - Spreads the min-ready nodes across providers. - Solves the situation where a single provider may not be able to fulfill the min-ready request because of quota issues or configuration limitations. For example, min-ready=2 but max-servers=1. We need to be able to force new node launches to satisfy min-ready, so a new 'reuse' attribute is added to the NodeRequest model. Enables the test_node_vhd_and_qcow2 test which is an example of min-ready=2/max-servers=1 across two providers. 
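The mechanics reduce to submitting one single-node request per missing ready node, with reuse switched off so each request forces a fresh launch (a sketch of the idea; the real logic lives in createMinReady in the diff below):

    from nodepool import zk

    def submit_min_ready_requests(zk_conn, label_name, need):
        # One request per node lets different providers pick up different
        # requests, and keeps one provider's quota or max-servers limit
        # from failing the whole min-ready batch.
        requests = []
        for _ in range(need):
            req = zk.NodeRequest()
            req.state = zk.REQUESTED
            req.node_types.append(label_name)
            req.reuse = False  # do not satisfy this with an existing ready node
            zk_conn.storeNodeRequest(req)
            requests.append(req)
        return requests
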
Change-Id: Id32318082035416be6de4b8fcec3709f4ade03a0 --- nodepool/nodepool.py | 63 +++++++++++++++++++++------------ nodepool/tests/__init__.py | 4 +-- nodepool/tests/test_nodepool.py | 24 +++++-------- nodepool/tests/test_zk.py | 4 +++ nodepool/zk.py | 6 +++- 5 files changed, 61 insertions(+), 40 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 42fd443db..fce61a98c 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1057,7 +1057,7 @@ class NodeRequestHandler(object): for ntype in self.request.node_types: # First try to grab from the list of already available nodes. got_a_node = False - if ntype in ready_nodes: + if self.request.reuse and ntype in ready_nodes: for node in ready_nodes[ntype]: try: self.zk.lockNode(node, blocking=False) @@ -1662,26 +1662,37 @@ class NodePool(threading.Thread): to our request, since we are deleting the request. ''' for label in self._submittedRequests.keys(): - req = self._submittedRequests[label] - self._submittedRequests[label] = self.zk.getNodeRequest(req.id) + label_requests = self._submittedRequests[label] + active_requests = [] - if self._submittedRequests[label]: - if self._submittedRequests[label].state == zk.FULFILLED: - self.log.debug("min-ready node request for %s fulfilled", label) + for req in label_requests: + req = self.zk.getNodeRequest(req.id) + + if not req: + continue + + if req.state == zk.FULFILLED: # Reset node allocated_to - for node_id in self._submittedRequests[label].nodes: + for node_id in req.nodes: node = self.zk.getNode(node_id) node.allocated_to = None - # NOTE: locking shouldn't be necessary since a node with - # allocated_to set should not be locked except by the - # creator of the request (us). + # NOTE: locking shouldn't be necessary since a node + # with allocated_to set should not be locked except + # by the creator of the request (us). self.zk.storeNode(node) - self.zk.deleteNodeRequest(self._submittedRequests[label]) - del self._submittedRequests[label] - elif self._submittedRequests[label].state == zk.FAILED: - self.log.debug("min-ready node request for %s failed", label) - self.zk.deleteNodeRequest(self._submittedRequests[label]) - del self._submittedRequests[label] + self.zk.deleteNodeRequest(req) + elif req.state == zk.FAILED: + self.log.debug("min-ready node request failed: %s", req) + self.zk.deleteNodeRequest(req) + else: + active_requests.append(req) + + if active_requests: + self._submittedRequests[label] = active_requests + else: + self.log.debug( + "No more active min-ready requests for label %s", label) + del self._submittedRequests[label] def createMinReady(self): ''' @@ -1692,13 +1703,15 @@ class NodePool(threading.Thread): Requests we've already submitted are stored in the _submittedRequests dict, keyed by label. 
''' - def createRequest(label_name, count): + def createRequest(label_name): req = zk.NodeRequest() req.state = zk.REQUESTED - for i in range(0, count): - req.node_types.append(label_name) + req.node_types.append(label_name) + req.reuse = False # force new node launches self.zk.storeNodeRequest(req) - self._submittedRequests[label_name] = req + if label_name not in self._submittedRequests: + self._submittedRequests[label_name] = [] + self._submittedRequests[label_name].append(req) # Since we could have already submitted node requests, do not # resubmit a request for a type if a request for that type is @@ -1725,8 +1738,14 @@ class NodePool(threading.Thread): need = min_ready - len(ready_nodes[label.name]) if need: - self.log.info("Creating request for %d %s nodes", need, label.name) - createRequest(label.name, need) + # Create requests for 1 node at a time. This helps to split + # up requests across providers, and avoids scenario where a + # single provider might fail the entire request because of + # quota (e.g., min-ready=2, but max-servers=1). + self.log.info("Creating requests for %d %s nodes", + need, label.name) + for i in range(0, need): + createRequest(label.name) def run(self): ''' diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 191d5cfb6..18ff602c8 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -420,11 +420,11 @@ class DBTestCase(BaseTestCase): self.wait_for_threads() - def waitForNodes(self, label): + def waitForNodes(self, label, count=1): while True: self.wait_for_threads() ready_nodes = self.zk.getReadyNodesOfTypes([label]) - if ready_nodes: + if label in ready_nodes and len(ready_nodes[label]) == count: break time.sleep(1) self.wait_for_threads() diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 8e1651eee..b0b6db1aa 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -166,28 +166,22 @@ class TestNodepool(tests.DBTestCase): state=nodedb.READY) self.assertEqual(len(nodes), 1) - @skip("Disabled for early v3 development") def test_node_vhd_and_qcow2(self): """Test label provided by vhd and qcow2 images builds""" configfile = self.setup_config('node_vhd_and_qcow2.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) self._useBuilder(configfile) - pool.start() self.waitForImage('fake-provider1', 'fake-image') self.waitForImage('fake-provider2', 'fake-image') - self.waitForNodes(pool) - - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider1', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) - nodes = session.getNodes(provider_name='fake-provider2', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) + pool.start() + nodes = self.waitForNodes('fake-label', 2) + self.assertEqual(len(nodes), 2) + self.assertEqual(zk.READY, nodes[0].state) + self.assertEqual(zk.READY, nodes[1].state) + if nodes[0].provider == 'fake-provider1': + self.assertEqual(nodes[1].provider, 'fake-provider2') + else: + self.assertEqual(nodes[1].provider, 'fake-provider1') @skip("Disabled for early v3 development") def test_dib_upload_fail(self): diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 2ff2205a0..614ad0406 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -667,6 +667,7 @@ class TestZKModel(tests.BaseTestCase): o.declined_by.append("abc") 
o.node_types.append('trusty') o.nodes.append('100') + o.reuse = False d = o.toDict() self.assertNotIn('id', d) self.assertIn('state', d) @@ -674,6 +675,7 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(d['declined_by'], o.declined_by) self.assertEqual(d['node_types'], o.node_types) self.assertEqual(d['nodes'], o.nodes) + self.assertEqual(d['reuse'], o.reuse) def test_NodeRequest_fromDict(self): now = int(time.time()) @@ -684,6 +686,7 @@ class TestZKModel(tests.BaseTestCase): 'declined_by': ['abc'], 'node_types': ['trusty'], 'nodes': ['100'], + 'reuse': False, } o = zk.NodeRequest.fromDict(d, req_id) @@ -693,6 +696,7 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(o.declined_by, d['declined_by']) self.assertEqual(o.node_types, d['node_types']) self.assertEqual(o.nodes, d['nodes']) + self.assertEqual(o.reuse, d['reuse']) def test_Node_toDict(self): o = zk.Node('123') diff --git a/nodepool/zk.py b/nodepool/zk.py index 93b6fc921..f9b8eb9e3 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -320,6 +320,7 @@ class NodeRequest(BaseModel): self.declined_by = [] self.node_types = [] self.nodes = [] + self.reuse = True def __repr__(self): d = self.toDict() @@ -332,7 +333,8 @@ class NodeRequest(BaseModel): return (self.id == other.id and self.declined_by == other.declined_by and self.node_types == other.node_types and - self.nodes == other.nodes) + self.nodes == other.nodes, + self.reuse == other.reuse) else: return False @@ -344,6 +346,7 @@ class NodeRequest(BaseModel): d['declined_by'] = self.declined_by d['node_types'] = self.node_types d['nodes'] = self.nodes + d['reuse'] = self.reuse return d @staticmethod @@ -361,6 +364,7 @@ class NodeRequest(BaseModel): o.declined_by = d.get('declined_by', []) o.node_types = d.get('node_types', []) o.nodes = d.get('nodes', []) + o.reuse = d.get('reuse', True) return o From a49d722956dd3bc86dfd63c101de1750167914f1 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 13 Feb 2017 09:39:26 -0800 Subject: [PATCH 044/309] Use helper function to instantiate ProviderManager This will create a fake provider manager if necessary. Change-Id: I644929ed35c2eb91d0eb0ee9fd32dd814e0e91b0 Depends-On: I35982d0d03ae00af77a515839b53542c5d830c89 --- nodepool/nodepool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index fce61a98c..c9d4816e3 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1213,7 +1213,7 @@ class ProviderWorker(threading.Thread): if not self.manager: self.log.debug("Creating new ProviderManager") - self.manager = provider_manager.ProviderManager( + self.manager = provider_manager.get_provider_manager( self.provider, use_taskmanager=True) self.manager.start() From ed1201ccf4d74b40591d095868d8b12686275fcc Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 13 Feb 2017 10:41:23 -0800 Subject: [PATCH 045/309] Wait for main loop when stopping When stopping the nodepool launcher, wait for the main loop to exit. This prevents race conditions (especially in tests) where it might continue to run for a bit while stopping. 
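The pattern is the standard one for worker threads, reduced to its essentials below (a stand-in class, not nodepool's):

    import threading
    import time

    class MainLoop(threading.Thread):
        # Minimal stand-in showing why stop() should join the thread:
        # without the join, run() may still be executing when stop() returns.
        def __init__(self):
            super(MainLoop, self).__init__()
            self._stopped = False

        def run(self):
            while not self._stopped:
                time.sleep(0.1)

        def stop(self):
            self._stopped = True
            self.join()  # wait for the main loop to actually exit
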
Change-Id: I3f25fd40948c45b14944468abcbceee4151097fb --- nodepool/nodepool.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 42fd443db..754568d33 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1368,6 +1368,7 @@ class NodePool(threading.Thread): self.log.debug("Waiting for %s" % thd.name) thd.join() + self.join() self.log.debug("Finished stopping") def loadConfig(self): From 5b3ca0d3edfd4d8aac00a09369f3b45c79a42932 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Mon, 13 Feb 2017 13:40:31 -0500 Subject: [PATCH 046/309] Re-enable test_node_net_name test Update the test to use new zookeeper syntax. Change-Id: I467569fbccc47d22415c25b9e197704632bb538b Signed-off-by: Paul Belanger --- nodepool/tests/test_nodepool.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index b0b6db1aa..34fe93fd0 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -132,7 +132,6 @@ class TestNodepool(tests.DBTestCase): state=nodedb.READY) self.assertEqual(len(nodes), 0) - @skip("Disabled for early v3 development") def test_node_net_name(self): """Test that a node is created with a net name""" configfile = self.setup_config('node_net_name.yaml') @@ -140,14 +139,10 @@ class TestNodepool(tests.DBTestCase): self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) + self.assertEqual(nodes[0].provider, 'fake-provider') + self.assertEqual(nodes[0].type, 'fake-label') @skip("Disabled for early v3 development") def test_node_vhd_image(self): From e0f14f364914fb646709afc83ac6f0ff5b2c9e11 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Mon, 13 Feb 2017 13:58:20 -0500 Subject: [PATCH 047/309] Re-enable test_node_vhd_image test Update syntax for zookeeper. 
Change-Id: I867822e4fce63a781dd0b634ea5699baea4ba38c Signed-off-by: Paul Belanger --- nodepool/tests/test_nodepool.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 34fe93fd0..282645b73 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -144,7 +144,6 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(nodes[0].provider, 'fake-provider') self.assertEqual(nodes[0].type, 'fake-label') - @skip("Disabled for early v3 development") def test_node_vhd_image(self): """Test that a image and node are created vhd image""" configfile = self.setup_config('node_vhd.yaml') @@ -152,14 +151,10 @@ class TestNodepool(tests.DBTestCase): self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) + nodes = self.waitForNodes('fake-label') self.assertEqual(len(nodes), 1) + self.assertEqual(nodes[0].provider, 'fake-provider') + self.assertEqual(nodes[0].type, 'fake-label') def test_node_vhd_and_qcow2(self): """Test label provided by vhd and qcow2 images builds""" From ca228357b1866e680fcab531cbb25b5d592c493c Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 13 Feb 2017 14:48:13 -0500 Subject: [PATCH 048/309] Set Node image_id and launcher attributes Change-Id: I4b9fa85b29117b9dae82024c2a83d248aeb37fe6 --- nodepool/nodepool.py | 6 ++++++ nodepool/tests/test_nodepool.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 0a624b176..21673740e 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -725,6 +725,11 @@ class NodeLauncher(threading.Thread): self._node.external_id = server.id self._node.hostname = hostname + self._node.image_id = "{path}/{upload_id}".format( + path=self._zk._imageUploadPath(cloud_image.image_name, + cloud_image.build_id, + cloud_image.provider_name), + upload_id=cloud_image.id) # Checkpoint save the updated node info self._zk.storeNode(self._node) @@ -1077,6 +1082,7 @@ class NodeRequestHandler(object): node.state = zk.INIT node.type = ntype node.provider = self.provider.name + node.launcher = self.launcher_id node.allocated_to = self.request.id # Note: It should be safe (i.e., no race) to lock the node diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index b0b6db1aa..6d4400cee 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -38,7 +38,7 @@ class TestNodepool(tests.DBTestCase): ''' configfile = self.setup_config('node.yaml') self._useBuilder(configfile) - self.waitForImage('fake-provider', 'fake-image') + image = self.waitForImage('fake-provider', 'fake-image') pool = self.useNodepool(configfile, watermark_sleep=1) pool.start() @@ -56,10 +56,16 @@ class TestNodepool(tests.DBTestCase): node = self.zk.getNode(node_id) self.assertEqual(node.allocated_to, req.id) self.assertEqual(node.state, zk.READY) + self.assertIsNotNone(node.launcher) + p = "{path}/{id}".format( + path=self.zk._imageUploadPath(image.image_name, + image.build_id, + image.provider_name), + id=image.id) + self.assertEqual(node.image_id, p) self.zk.lockNode(node, blocking=False) self.zk.unlockNode(node) - @mock.patch('nodepool.nodepool.NodeLauncher._launchNode') def 
test_fail_request_on_launch_failure(self, mock_launch): ''' From 5e18fbee2c784de2c9294f76e603cadf995bca06 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 13 Feb 2017 15:32:19 -0500 Subject: [PATCH 049/309] Add generator API method for node iteration This is a common enough pattern that we should simplify it. Change-Id: I649c691cf09087cf7d46fa1e9c6c879e4d60247b --- nodepool/nodepool.py | 5 ++--- nodepool/tests/test_zk.py | 7 +++++++ nodepool/zk.py | 14 +++++++++++--- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 21673740e..baf7d2cf9 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -999,9 +999,8 @@ class NodeRequestHandler(object): :returns: An integer for the number launched for this provider. ''' count = 0 - for node_id in self.zk.getNodes(): - node = self.zk.getNode(node_id) - if node and node.provider == self.provider.name: + for node in self.zk.nodeIterator(): + if node.provider == self.provider.name: count += 1 return count diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 614ad0406..1aad83ef3 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -571,6 +571,13 @@ class TestZooKeeper(tests.DBTestCase): self.assertEqual(1, len(r['label1'])) self.assertEqual(n2, r['label1'][0]) + def test_nodeIterator(self): + n1 = self._create_node() + i = self.zk.nodeIterator() + self.assertEqual(n1, i.next()) + with testtools.ExpectedException(StopIteration): + i.next() + class TestZKModel(tests.BaseTestCase): diff --git a/nodepool/zk.py b/nodepool/zk.py index f9b8eb9e3..eac6c451f 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -1463,12 +1463,20 @@ class ZooKeeper(object): that are ready, or an empty dict if none are found. ''' ret = {} - for node_id in self.getNodes(): - node = self.getNode(node_id) - if (node and node.state == READY and + for node in self.nodeIterator(): + if (node.state == READY and not node.allocated_to and node.type in labels ): if node.type not in ret: ret[node.type] = [] ret[node.type].append(node) return ret + + def nodeIterator(self): + ''' + Utility generator method for iterating through all nodes. + ''' + for node_id in self.getNodes(): + node = self.getNode(node_id) + if node: + yield node From 218e358dcfd35a76d7cfed191f6c49d2b9513e09 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 14 Feb 2017 15:17:35 -0500 Subject: [PATCH 050/309] Disconnect from ZooKeeper at shutdown Here I sit all broken hearted Wondering why, with zk started, We did not try to disconnect, When a stop command, we did get. I blame the programmer who let it be, Oh, wait! Just realized, that programmer is me. Change-Id: I542858a10182719f1f4bdfc566d327eb2790f53d --- nodepool/nodepool.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index baf7d2cf9..b134c5646 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1374,6 +1374,7 @@ class NodePool(threading.Thread): thd.join() self.join() + self.zk.disconnect() self.log.debug("Finished stopping") def loadConfig(self): From da774f869689a88eeb61ff907887eaba413aa682 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Mon, 13 Feb 2017 14:08:43 -0500 Subject: [PATCH 051/309] Re-enable test_dib_upload_fail test Update test to use new zookeeper syntax. 
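The zookeeper-side assertions lean on the helpers added earlier in this series; roughly (illustrative sketch):

    def count_nodes(zk_conn, label):
        # nodeIterator() yields every Node stored in ZooKeeper, so a total
        # count is a one-liner; getReadyNodesOfTypes() gives the per-label
        # ready nodes the test compares against.
        total = sum(1 for _ in zk_conn.nodeIterator())
        ready = zk_conn.getReadyNodesOfTypes([label]).get(label, [])
        return total, len(ready)
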
Change-Id: Ida35846f2ab5e3339ec03bdf27d72d68c2f3ad55 Signed-off-by: Paul Belanger --- nodepool/tests/test_nodepool.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index ac8786a64..590fe1067 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -179,7 +179,6 @@ class TestNodepool(tests.DBTestCase): else: self.assertEqual(nodes[1].provider, 'fake-provider1') - @skip("Disabled for early v3 development") def test_dib_upload_fail(self): """Test that an image upload failure is contained.""" configfile = self.setup_config('node_upload_fail.yaml') @@ -187,19 +186,14 @@ class TestNodepool(tests.DBTestCase): self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider2', 'fake-image') - self.waitForNodes(pool) - - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider1', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 0) - nodes = session.getNodes(provider_name='fake-provider2', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 2) + nodes = self.waitForNodes('fake-label', 2) + self.assertEqual(len(nodes), 2) + total_nodes = sum(1 for _ in self.zk.nodeIterator()) + self.assertEqual(total_nodes, 2) + self.assertEqual(nodes[0].provider, 'fake-provider2') + self.assertEqual(nodes[0].type, 'fake-label') + self.assertEqual(nodes[1].provider, 'fake-provider2') + self.assertEqual(nodes[1].type, 'fake-label') @skip("Disabled for early v3 development") def test_node_az(self): From 93b9b54883e82c6fd243615a9b56343354897ef7 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Mon, 13 Feb 2017 15:55:53 -0500 Subject: [PATCH 052/309] Re-enable working test_builder.py tests But leave the current failing tests disabled. These will be cleaned up in a follow up patch. Change-Id: Iea7d56ceb14684c77cb991d362bff992ca061590 Signed-off-by: Paul Belanger --- nodepool/tests/test_builder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nodepool/tests/test_builder.py b/nodepool/tests/test_builder.py index 0ce7a69b6..a02e3840b 100644 --- a/nodepool/tests/test_builder.py +++ b/nodepool/tests/test_builder.py @@ -15,6 +15,7 @@ import os import fixtures +from unittest import skip from nodepool import builder, exceptions, fakeprovider, tests from nodepool import zk @@ -85,9 +86,6 @@ class TestNodepoolBuilderDibImage(tests.BaseTestCase): self.assertRaises(exceptions.BuilderError, image.to_path, '/imagedir/') class TestNodePoolBuilder(tests.DBTestCase): - def setUp(self): - super(tests.DBTestCase, self).setUp() - self.skipTest("Disabled for early v3 development") def test_start_stop(self): config = self.setup_config('node.yaml') @@ -98,6 +96,7 @@ class TestNodePoolBuilder(tests.DBTestCase): nb.start() nb.stop() + @skip("Disabled for early v3 development") def test_image_upload_fail(self): """Test that image upload fails are handled properly.""" @@ -279,6 +278,7 @@ class TestNodePoolBuilder(tests.DBTestCase): # Make sure our cleanup worker properly removes the first build. 
self.waitForBuildDeletion('fake-image', '0000000001') + @skip("Disabled for early v3 development") def test_diskimage_build_only(self): configfile = self.setup_config('node_diskimage_only.yaml') self._useBuilder(configfile) From ba4eafa5a875bccf1ea7c454e9740d53f534f7bd Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Mon, 13 Feb 2017 15:59:36 -0500 Subject: [PATCH 053/309] Re-enable test_diskimage_build_only test Remove gearman and zmq settings from yaml file. Change-Id: I8f0938ce132af89b1c7fcd2e9b1d217f23d4df0f Signed-off-by: Paul Belanger --- nodepool/tests/fixtures/node_diskimage_only.yaml | 7 ------- nodepool/tests/test_builder.py | 1 - 2 files changed, 8 deletions(-) diff --git a/nodepool/tests/fixtures/node_diskimage_only.yaml b/nodepool/tests/fixtures/node_diskimage_only.yaml index f85ae34dd..63a0e91ec 100644 --- a/nodepool/tests/fixtures/node_diskimage_only.yaml +++ b/nodepool/tests/fixtures/node_diskimage_only.yaml @@ -5,13 +5,6 @@ cron: check: '*/15 * * * *' cleanup: '*/1 * * * *' -zmq-publishers: - - tcp://localhost:8881 - -gearman-servers: - - host: localhost - port: {gearman_port} - zookeeper-servers: - host: {zookeeper_host} port: {zookeeper_port} diff --git a/nodepool/tests/test_builder.py b/nodepool/tests/test_builder.py index a02e3840b..cf9c24c4a 100644 --- a/nodepool/tests/test_builder.py +++ b/nodepool/tests/test_builder.py @@ -278,7 +278,6 @@ class TestNodePoolBuilder(tests.DBTestCase): # Make sure our cleanup worker properly removes the first build. self.waitForBuildDeletion('fake-image', '0000000001') - @skip("Disabled for early v3 development") def test_diskimage_build_only(self): configfile = self.setup_config('node_diskimage_only.yaml') self._useBuilder(configfile) From b5b3ad84d03c1102c2c1e60658bf77e7e060979a Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Mon, 13 Feb 2017 16:01:31 -0500 Subject: [PATCH 054/309] Re-enable test_image_upload_fail test Update test to use new zookeeper syntax. Change-Id: I2ace2f1f9e0827ab391821094674973d75e8b556 Signed-off-by: Paul Belanger --- nodepool/tests/test_builder.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nodepool/tests/test_builder.py b/nodepool/tests/test_builder.py index cf9c24c4a..476f6998e 100644 --- a/nodepool/tests/test_builder.py +++ b/nodepool/tests/test_builder.py @@ -15,7 +15,6 @@ import os import fixtures -from unittest import skip from nodepool import builder, exceptions, fakeprovider, tests from nodepool import zk @@ -96,7 +95,6 @@ class TestNodePoolBuilder(tests.DBTestCase): nb.start() nb.stop() - @skip("Disabled for early v3 development") def test_image_upload_fail(self): """Test that image upload fails are handled properly.""" @@ -118,7 +116,8 @@ class TestNodePoolBuilder(tests.DBTestCase): self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) newest_builds = self.zk.getMostRecentBuilds(1, 'fake-image', state=zk.READY) From 386a5923bbe5df9019163344d3d95135e98f3edb Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Wed, 15 Feb 2017 19:26:43 -0500 Subject: [PATCH 055/309] Move ProviderManagers into main NodePool thread Move the ProviderManager objects up from the ProviderWorker threads into the parent NodePool thread. This is preparation for a new child thread that will also need access to the managers. 
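After the move, a worker thread no longer owns its manager; it reaches everything through the parent NodePool with small accessors, shaped roughly like this (simplified; see the diff below):

    import threading

    class ProviderWorker(threading.Thread):
        # Simplified shape only: the worker keeps a reference to the parent
        # NodePool and the provider name, and resolves the provider config
        # and manager through it on demand.
        def __init__(self, nodepool, provider_name):
            super(ProviderWorker, self).__init__()
            self.nodepool = nodepool
            self.provider_name = provider_name

        def getProviderConfig(self):
            return self.nodepool.config.providers[self.provider_name]

        def getProviderManager(self):
            return self.nodepool.getProviderManager(self.provider_name)
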
Change-Id: I5a382992280b0e459a1fffe91b10d1d91711309c --- nodepool/cmd/nodepoolcmd.py | 2 +- nodepool/nodepool.py | 96 ++++++++++++++----------------------- 2 files changed, 38 insertions(+), 60 deletions(-) diff --git a/nodepool/cmd/nodepoolcmd.py b/nodepool/cmd/nodepoolcmd.py index 375d47e69..b0425368f 100644 --- a/nodepool/cmd/nodepoolcmd.py +++ b/nodepool/cmd/nodepoolcmd.py @@ -213,7 +213,7 @@ class NodePoolCmd(NodepoolApp): if (self.args.provider and provider.name != self.args.provider): continue - manager = self.pool.getProviderManager(provider) + manager = self.pool.getProviderManager(provider.name) # Build list of provider images as known by the provider provider_images = [] diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 9eee65d81..f55c064e5 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -779,10 +779,10 @@ class NodeRequestHandler(object): :param NodeRequest request: The request to handle. ''' self.log = logging.getLogger("nodepool.NodeRequestHandler") - self.provider = pw.provider - self.zk = pw.zk - self.labels = pw.labels - self.manager = pw.manager + self.provider = pw.getProviderConfig() + self.zk = pw.getZK() + self.labels = pw.getLabelsConfig() + self.manager = pw.getProviderManager() self.launcher_id = pw.launcher_id self.request = request self.launch_manager = None @@ -988,22 +988,17 @@ class ProviderWorker(threading.Thread): that will be recognized and this thread will shut itself down. ''' - def __init__(self, configfile, zk, provider, - watermark_sleep=WATERMARK_SLEEP): + def __init__(self, nodepool, provider_name): threading.Thread.__init__( - self, name='ProviderWorker.%s' % provider.name + self, name='ProviderWorker.%s' % provider_name ) self.log = logging.getLogger("nodepool.%s" % self.name) + self.nodepool = nodepool + self.provider_name = provider_name self.running = False - self.configfile = configfile self.request_handlers = [] - self.watermark_sleep = watermark_sleep - - # These attributes will be used by NodeRequestHandler - self.zk = zk - self.manager = None - self.labels = None - self.provider = provider + self.watermark_sleep = nodepool.watermark_sleep + self.zk = self.getZK() self.launcher_id = "%s-%s-%s" % (socket.gethostname(), os.getpid(), self.ident) @@ -1012,33 +1007,6 @@ class ProviderWorker(threading.Thread): # Private methods #---------------------------------------------------------------- - def _updateProvider(self): - ''' - Update the provider definition from the config file. - - If this provider has been removed from the config, we need to - stop processing the request queue. This will effectively cause - this thread to terminate. - ''' - config = nodepool_config.loadConfig(self.configfile) - self.labels = config.labels - - if self.provider.name not in config.providers.keys(): - self.log.info("Provider %s removed from config" - % self.provider.name) - self.stop() - elif self.provider != config.providers[self.provider.name]: - self.provider = config.providers[self.provider.name] - if self.manager: - self.manager.stop() - self.manager = None - - if not self.manager: - self.log.debug("Creating new ProviderManager") - self.manager = provider_manager.get_provider_manager( - self.provider, use_taskmanager=True) - self.manager.start() - def _activeThreads(self): ''' Return the number of alive threads in use by this provider. @@ -1060,13 +1028,14 @@ class ProviderWorker(threading.Thread): satisfy the request, then return. We will need to periodically poll the handler for completion. 
''' - if self.provider.max_concurrency == 0: + provider = self.getProviderConfig() + if provider.max_concurrency == 0: return for req_id in self.zk.getNodeRequests(): # Short-circuit for limited request handling - if (self.provider.max_concurrency > 0 - and self._activeThreads() >= self.provider.max_concurrency + if (provider.max_concurrency > 0 + and self._activeThreads() >= provider.max_concurrency ): return @@ -1112,6 +1081,18 @@ class ProviderWorker(threading.Thread): # Public methods #---------------------------------------------------------------- + def getZK(self): + return self.nodepool.getZK() + + def getProviderConfig(self): + return self.nodepool.config.providers[self.provider_name] + + def getProviderManager(self): + return self.nodepool.getProviderManager(self.provider_name) + + def getLabelsConfig(self): + return self.nodepool.config.labels + def run(self): self.running = True @@ -1124,10 +1105,6 @@ class ProviderWorker(threading.Thread): # Make sure we're always registered with ZK self.zk.registerLauncher(self.launcher_id) - self._updateProvider() - if not self.running: - break - self._assignHandlers() self._removeCompletedHandlers() time.sleep(self.watermark_sleep) @@ -1143,10 +1120,6 @@ class ProviderWorker(threading.Thread): self.log.info("%s received stop" % self.name) self.running = False - if self.manager: - self.manager.stop() - self.manager.join() - class NodePool(threading.Thread): log = logging.getLogger("nodepool.NodePool") @@ -1298,8 +1271,8 @@ class NodePool(threading.Thread): def getZK(self): return self.zk - def getProviderManager(self, provider): - return self.config.provider_managers[provider.name] + def getProviderManager(self, provider_name): + return self.config.provider_managers[provider_name] def getJenkinsManager(self, target): if target.name in self.config.jenkins_managers: @@ -1462,6 +1435,7 @@ class NodePool(threading.Thread): def updateConfig(self): config = self.loadConfig() + provider_manager.ProviderManager.reconfigure(self.config, config) self.reconfigureZooKeeper(config) self.setConfig(config) @@ -1573,20 +1547,24 @@ class NodePool(threading.Thread): self.createMinReady() + # Stop any ProviderWorker threads if the provider was removed + # from the config. + for provider_name in self._provider_threads.keys(): + if provider_name not in self.config.providers.keys(): + self._provider_threads[provider_name].stop() + # Start (or restart) provider threads for each provider in # the config. Removing a provider from the config and then # adding it back would cause a restart. for p in self.config.providers.values(): if p.name not in self._provider_threads.keys(): - t = ProviderWorker(self.configfile, self.zk, p, - self.watermark_sleep) + t = ProviderWorker(self, p.name) self.log.info( "Starting %s" % t.name) t.start() self._provider_threads[p.name] = t elif not self._provider_threads[p.name].isAlive(): self._provider_threads[p.name].join() - t = ProviderWorker(self.configfile, self.zk, p, - self.watermark_sleep) + t = ProviderWorker(self, p.name) self.log.info( "Restarting %s" % t.name) t.start() self._provider_threads[p.name] = t From 59fa7781c72612218d8bad42b0dc7dd3047316c1 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 14 Feb 2017 10:53:29 -0500 Subject: [PATCH 056/309] Implement node cleanup To prove it works, this also reworks the 'delete' command to use ZooKeeper. Summary: - Re-enables the 'delete' command - Adds waitForNodeDeletion() for testing. 
- Re-enables tests: - test_node_delete_success - test_delete - test_delete_now - Fixes a bug in Node.__eq__ causing it to fail. Change-Id: I539bca7d2d3d3b90f8e04e9098065e8b6797b194 --- nodepool/cmd/nodepoolcmd.py | 28 ++++--- nodepool/nodepool.py | 131 ++++++++++++++++++++++++++++---- nodepool/status.py | 12 +-- nodepool/tests/__init__.py | 14 ++++ nodepool/tests/test_commands.py | 37 +++++---- nodepool/tests/test_nodepool.py | 42 ++++------ nodepool/tests/test_zk.py | 7 ++ nodepool/zk.py | 17 ++++- 8 files changed, 212 insertions(+), 76 deletions(-) diff --git a/nodepool/cmd/nodepoolcmd.py b/nodepool/cmd/nodepoolcmd.py index b0425368f..ed9ebaab4 100644 --- a/nodepool/cmd/nodepoolcmd.py +++ b/nodepool/cmd/nodepoolcmd.py @@ -18,7 +18,7 @@ import argparse import logging.config import sys -from nodepool import nodedb +from nodepool import provider_manager from nodepool import nodepool from nodepool import status from nodepool import zk @@ -261,17 +261,21 @@ class NodePoolCmd(NodepoolApp): self.list(node_id=self.args.id) def delete(self): + node = self.zk.getNode(self.args.id) + provider = self.pool.config.providers[node.provider] + self.zk.lockNode(node, blocking=True, timeout=5) + if self.args.now: - self.pool.reconfigureManagers(self.pool.config) - with self.pool.getDB().getSession() as session: - node = session.getNode(self.args.id) - if not node: - print "Node %s not found." % self.args.id - elif self.args.now: - self.pool._deleteNode(session, node) - else: - node.state = nodedb.DELETE - self.list(node_id=node.id) + manager = provider_manager.get_provider_manager(provider, True) + manager.start() + nodepool.InstanceDeleter.delete(self.zk, manager, node) + manager.stop() + else: + node.state = zk.DELETING + self.zk.storeNode(node) + self.zk.unlockNode(node) + + self.list(node_id=node.id) def dib_image_delete(self): (image, build_num) = self.args.id.rsplit('-', 1) @@ -354,7 +358,7 @@ class NodePoolCmd(NodepoolApp): if self.args.command in ('image-build', 'dib-image-list', 'image-list', 'dib-image-delete', 'image-delete', 'alien-image-list', - 'list', 'hold'): + 'list', 'hold', 'delete'): self.zk = zk.ZooKeeper() self.zk.connect(config.zookeeper_servers.values()) else: diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index f55c064e5..4830aa8ee 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -174,20 +174,41 @@ class NodeCompleteThread(threading.Thread): class InstanceDeleter(threading.Thread): log = logging.getLogger("nodepool.InstanceDeleter") - def __init__(self, nodepool, provider_name, external_id): + def __init__(self, zk, manager, node): threading.Thread.__init__(self, name='InstanceDeleter for %s %s' % - (provider_name, external_id)) - self.nodepool = nodepool - self.provider_name = provider_name - self.external_id = external_id + (node.provider, node.external_id)) + self._zk = zk + self._manager = manager + self._node = node + + @staticmethod + def delete(zk, manager, node): + ''' + Delete a node. + + This is a class method so we can support instantaneous deletes. 
+ ''' + try: + manager.cleanupServer(node.external_id) + except provider_manager.NotFound: + InstanceDeleter.log.info("Instance %s not found in provider %s", + node.external_id, node.provider) + except Exception: + InstanceDeleter.log.exception( + "Exception deleting instance %s from %s:", + node.external_id, node.provider) + # Don't delete the ZK node in this case, but do unlock it + zk.unlockNode(node) + return + + InstanceDeleter.log.info( + "Deleting ZK node id=%s, state=%s, external_id=%s", + node.id, node.state, node.external_id) + zk.unlockNode(node) + zk.deleteNode(node) def run(self): - try: - self.nodepool._deleteInstance(self.provider_name, - self.external_id) - except Exception: - self.log.exception("Exception deleting instance %s from %s:" % - (self.external_id, self.provider_name)) + self.delete(self._zk, self._manager, self._node) class NodeDeleter(threading.Thread): @@ -1121,26 +1142,95 @@ class ProviderWorker(threading.Thread): self.running = False +class NodeCleanupWorker(threading.Thread): + def __init__(self, nodepool, interval): + threading.Thread.__init__(self, name='NodeCleanupWorker') + self.log = logging.getLogger("nodepool.NodeCleanupWorker") + self._nodepool = nodepool + self._interval = interval + self._running = False + + def _deleteInstance(self, node): + ''' + Delete an instance from a provider. + + A thread will be spawned to delete the actual instance from the + provider. + + :param Node node: A Node object representing the instance to delete. + ''' + self.log.info("Deleting instance %s from %s", + node.external_id, node.provider) + try: + t = InstanceDeleter( + self._nodepool.getZK(), + self._nodepool.getProviderManager(node.provider), + node) + t.start() + except Exception: + self.log.exception("Could not delete instance %s on provider %s", + node.external_id, node.provider) + + def _cleanupNodes(self): + ''' + Delete instances from providers and nodes entries from ZooKeeper. + ''' + # TODO(Shrews): Cleanup alien instances + + zk_conn = self._nodepool.getZK() + for node in zk_conn.nodeIterator(): + # Can't do anything if we aren't configured for this provider. + if node.provider not in self._nodepool.config.providers: + continue + + # Any nodes in these states that are unlocked can be deleted. + if node.state in (zk.USED, zk.IN_USE, zk.BUILDING, zk.DELETING): + try: + zk_conn.lockNode(node, blocking=False) + except exceptions.ZKLockException: + continue + + # The InstanceDeleter thread will unlock and remove the + # node from ZooKeeper if it succeeds. 
+ self._deleteInstance(node) + + def run(self): + self.log.info("Starting") + self._running = True + + while self._running: + try: + self._cleanupNodes() + except Exception: + self.log.exception("Exception in NodeCleanupWorker:") + + time.sleep(self._interval) + + self.log.info("Stopped") + + def stop(self): + self._running = False + self.join() + + class NodePool(threading.Thread): log = logging.getLogger("nodepool.NodePool") + #TODO(Shrews): remove --no-deletes option def __init__(self, securefile, configfile, no_deletes=False, watermark_sleep=WATERMARK_SLEEP): threading.Thread.__init__(self, name='NodePool') self.securefile = securefile self.configfile = configfile - self.no_deletes = no_deletes self.watermark_sleep = watermark_sleep + self.cleanup_interval = 5 self._stopped = False self.config = None self.apsched = None self.zk = None self.statsd = stats.get_client() self._provider_threads = {} - self._delete_threads = {} - self._delete_threads_lock = threading.Lock() - self._instance_delete_threads = {} - self._instance_delete_threads_lock = threading.Lock() + self._cleanup_thread = None self._wake_condition = threading.Condition() self._submittedRequests = {} @@ -1154,6 +1244,10 @@ class NodePool(threading.Thread): if self.apsched and self.apsched.running: self.apsched.shutdown() + if self._cleanup_thread: + self._cleanup_thread.stop() + self._cleanup_thread.join() + # Don't let stop() return until all provider threads have been # terminated. self.log.debug("Stopping provider threads") @@ -1547,6 +1641,11 @@ class NodePool(threading.Thread): self.createMinReady() + if not self._cleanup_thread: + self._cleanup_thread = NodeCleanupWorker( + self, self.cleanup_interval) + self._cleanup_thread.start() + # Stop any ProviderWorker threads if the provider was removed # from the config. 
for provider_name in self._provider_threads.keys(): diff --git a/nodepool/status.py b/nodepool/status.py index 934d42398..8946d0306 100644 --- a/nodepool/status.py +++ b/nodepool/status.py @@ -37,13 +37,13 @@ def node_list(zk, node_id=None): t.align = 'l' if node_id: node = zk.getNode(node_id) - t.add_row([node.id, node.provider, node.az, node.type, - node.launcher, node.hostname, node.external_id, - node.public_ipv4, node.private_ipv4, node.public_ipv6, - node.state, age(node.state_time), node.comment]) + if node: + t.add_row([node.id, node.provider, node.az, node.type, + node.launcher, node.hostname, node.external_id, + node.public_ipv4, node.private_ipv4, node.public_ipv6, + node.state, age(node.state_time), node.comment]) else: - for nid in zk.getNodes(): - node = zk.getNode(nid) + for node in zk.nodeIterator(): t.add_row([node.id, node.provider, node.az, node.type, node.launcher, node.hostname, node.external_id, node.public_ipv4, node.private_ipv4, node.public_ipv6, diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 18ff602c8..84deaf5e3 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -192,6 +192,8 @@ class BaseTestCase(testtools.TestCase): continue if t.name.startswith("NodeLauncher"): continue + if t.name.startswith("NodeCleanupWorker"): + continue if t.name not in whitelist: done = False if done: @@ -420,6 +422,17 @@ class DBTestCase(BaseTestCase): self.wait_for_threads() + def waitForNodeDeletion(self, node): + while True: + exists = False + for n in self.zk.nodeIterator(): + if node.id == n.id: + exists = True + break + if not exists: + break + time.sleep(1) + def waitForNodes(self, label, count=1): while True: self.wait_for_threads() @@ -445,6 +458,7 @@ class DBTestCase(BaseTestCase): def useNodepool(self, *args, **kwargs): args = (self.secure_conf,) + args pool = nodepool.NodePool(*args, **kwargs) + pool.cleanup_interval = .5 self.addCleanup(pool.stop) return pool diff --git a/nodepool/tests/test_commands.py b/nodepool/tests/test_commands.py index b87014b10..81a01300b 100644 --- a/nodepool/tests/test_commands.py +++ b/nodepool/tests/test_commands.py @@ -212,36 +212,47 @@ class TestNodepoolCMD(tests.DBTestCase): self.assert_listed(configfile, ['list'], 0, node_id, 1) self.assert_nodes_listed(configfile, 1, 'hold') - @skip("Disabled for early v3 development") def test_delete(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - # Assert one node exists and it is node 1 in a ready state. - self.assert_listed(configfile, ['list'], 0, 1, 1) - self.assert_nodes_listed(configfile, 1, zk.READY) - # Delete node 1 - self.assert_listed(configfile, ['delete', '1'], 10, 'delete', 1) + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) + + # Assert one node exists and it is nodes[0].id in a ready state. 
+ self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1) + self.assert_nodes_listed(configfile, 1, zk.READY) + + # Delete node + self.patch_argv('-c', configfile, 'delete', nodes[0].id) + nodepoolcmd.main() + self.waitForNodeDeletion(nodes[0]) + + # Assert the node is gone + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 0) - @skip("Disabled for early v3 development") def test_delete_now(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) self._useBuilder(configfile) pool.start() self.waitForImage( 'fake-provider', 'fake-image') - self.waitForNodes(pool) + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) + # Assert one node exists and it is node 1 in a ready state. - self.assert_listed(configfile, ['list'], 0, 1, 1) + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1) self.assert_nodes_listed(configfile, 1, zk.READY) - # Delete node 1 - self.patch_argv('-c', configfile, 'delete', '--now', '1') + + # Delete node + self.patch_argv('-c', configfile, 'delete', '--now', nodes[0].id) nodepoolcmd.main() + self.waitForNodeDeletion(nodes[0]) + # Assert the node is gone - self.assert_listed(configfile, ['list'], 0, 1, 0) + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 0) def test_image_build(self): configfile = self.setup_config('node.yaml') diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 590fe1067..f2e88e36c 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -248,42 +248,28 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(len(nodes), 1) self.assertEqual(nodes[0].ip, 'fake') - @skip("Disabled for early v3 development") def test_node_delete_success(self): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - node_id = -1 - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) - node_id = nodes[0].id + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) + self.assertEqual(zk.READY, nodes[0].state) + self.assertEqual('fake-provider', nodes[0].provider) + nodes[0].state = zk.DELETING + self.zk.storeNode(nodes[0]) - pool.deleteNode(node_id) - self.wait_for_threads() - self.waitForNodes(pool) + # Wait for this one to be deleted + self.waitForNodeDeletion(nodes[0]) - with pool.getDB().getSession() as session: - ready_nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - deleted_nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.DELETE) - # Make sure we have one node which is a new node - self.assertEqual(len(ready_nodes), 1) - self.assertNotEqual(node_id, ready_nodes[0].id) - - # Make sure our old node was deleted - self.assertEqual(len(deleted_nodes), 0) + # Wait for a new one to take it's place + new_nodes = self.waitForNodes('fake-label') + self.assertEqual(len(new_nodes), 1) + self.assertEqual(zk.READY, new_nodes[0].state) + self.assertEqual('fake-provider', new_nodes[0].provider) + self.assertNotEqual(nodes[0], new_nodes[0]) @skip("Disabled for early v3 development") def test_node_delete_failure(self): diff --git 
a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 1aad83ef3..e23de4f4b 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -553,6 +553,13 @@ class TestZooKeeper(tests.DBTestCase): self.zk.client.exists(self.zk._requestPath(req.id)) ) + def test_deleteNode(self): + n1 = self._create_node() + self.zk.deleteNode(n1) + self.assertIsNone( + self.zk.client.exists(self.zk._nodePath(n1.id)) + ) + def test_getReadyNodesOfTypes(self): n1 = self._create_node() n1.type = 'label1' diff --git a/nodepool/zk.py b/nodepool/zk.py index eac6c451f..24d3de288 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -414,7 +414,7 @@ class Node(BaseModel): self.launcher == other.launcher and self.created_time == other.created_time and self.external_id == other.external_id and - self.hostname == other.hostname, + self.hostname == other.hostname and self.comment == other.comment) else: return False @@ -1453,6 +1453,21 @@ class ZooKeeper(object): path = self._nodePath(node.id) self.client.set(path, node.serialize()) + def deleteNode(self, node): + ''' + Delete a node. + + :param Node node: The Node object representing the ZK node to delete. + ''' + if not node.id: + return + + path = self._nodePath(node.id) + try: + self.client.delete(path, recursive=True) + except kze.NoNodeError: + pass + def getReadyNodesOfTypes(self, labels): ''' Query ZooKeeper for unused/ready nodes. From d884a35937ed0dd93f60ea643eedbf844acf1922 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 9 Feb 2017 12:19:43 -0500 Subject: [PATCH 057/309] Re-enable devstack test job Change-Id: I8073a3706eb16fe0a271ea59f6381f363ead0e5b --- tools/check_devstack_plugin.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/check_devstack_plugin.sh b/tools/check_devstack_plugin.sh index f86692a1f..01ddcb2dc 100755 --- a/tools/check_devstack_plugin.sh +++ b/tools/check_devstack_plugin.sh @@ -40,8 +40,6 @@ function waitfornode { done } -exit 0 - if [ $NODEPOOL_PAUSE_CENTOS_7_DIB = 'false' ]; then # check that image built waitforimage centos-7 @@ -82,7 +80,7 @@ set -o errexit $NODEPOOL list # Try to delete the nodes that were just built -$NODEPOOL delete --now 1 +$NODEPOOL delete --now 0000000000 # show the deleted nodes (and their replacements may be building) $NODEPOOL list From 4e6dedfd0f9ae6b7a16ac5c382054837527afb6a Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Wed, 15 Feb 2017 12:05:03 -0500 Subject: [PATCH 058/309] Add lock state to node listing Also make waitfornodes in the devstack script to wait for unlocked nodes. 
Change-Id: I76f35d46f66f9e3beafe9f5f5a93049256a8df44 --- nodepool/status.py | 21 ++++++++++++++++++--- tools/check_devstack_plugin.sh | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/nodepool/status.py b/nodepool/status.py index 8946d0306..18846da9e 100644 --- a/nodepool/status.py +++ b/nodepool/status.py @@ -33,21 +33,36 @@ def node_list(zk, node_id=None): t = PrettyTable(["ID", "Provider", "AZ", "Label", "Launcher", "Hostname", "Server ID", "Public IPv4", "Private IPv4", "IPv6", - "State", "Age", "Comment"]) + "State", "Age", "Locked", "Comment"]) t.align = 'l' if node_id: node = zk.getNode(node_id) if node: + locked = "unlocked" + try: + zk.lockNode(node, blocking=False) + except Exception: + locked = "locked" + else: + zk.unlockNode(node) + t.add_row([node.id, node.provider, node.az, node.type, node.launcher, node.hostname, node.external_id, node.public_ipv4, node.private_ipv4, node.public_ipv6, - node.state, age(node.state_time), node.comment]) + node.state, age(node.state_time), locked, node.comment]) else: for node in zk.nodeIterator(): + locked = "unlocked" + try: + zk.lockNode(node, blocking=False) + except Exception: + locked = "locked" + else: + zk.unlockNode(node) t.add_row([node.id, node.provider, node.az, node.type, node.launcher, node.hostname, node.external_id, node.public_ipv4, node.private_ipv4, node.public_ipv6, - node.state, age(node.state_time), node.comment]) + node.state, age(node.state_time), locked, node.comment]) return str(t) diff --git a/tools/check_devstack_plugin.sh b/tools/check_devstack_plugin.sh index 01ddcb2dc..3c9b7bc02 100755 --- a/tools/check_devstack_plugin.sh +++ b/tools/check_devstack_plugin.sh @@ -31,7 +31,7 @@ function waitfornode { name=$1 state='ready' - while ! $NODEPOOL list | grep $name | grep $state; do + while ! $NODEPOOL list | grep $name | grep $state | grep "unlocked"; do $NODEPOOL image-list > /tmp/.nodepool-image-list.txt $NODEPOOL list > /tmp/.nodepool-list.txt sudo mv /tmp/.nodepool-image-list.txt $WORKSPACE/logs/nodepool-image-list.txt From ae20ff1792cfc7b13f45c5f1d07bba2a8a6a1614 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 16 Feb 2017 15:05:13 -0500 Subject: [PATCH 059/309] Wait for server deletion before deleting ZK node Change-Id: I2f8531cd2ffc027fe6d2d1064b3cd42f8b1845b1 --- nodepool/nodepool.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 4830aa8ee..5576fd018 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -190,6 +190,7 @@ class InstanceDeleter(threading.Thread): ''' try: manager.cleanupServer(node.external_id) + manager.waitForServerDeletion(node.external_id) except provider_manager.NotFound: InstanceDeleter.log.info("Instance %s not found in provider %s", node.external_id, node.provider) From 64b39bf970182f7e0dcb0fe2b0545803775ff5d1 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 16 Feb 2017 15:15:36 -0500 Subject: [PATCH 060/309] Partial dead code deletion Most of the unused code around node deletion can now be removed. Change-Id: I88262abae183c014b7cfa05517794252f0238dcd --- nodepool/nodepool.py | 139 ------------------------------------------- requirements.txt | 1 - 2 files changed, 140 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 5576fd018..dac12e6a4 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -16,8 +16,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import apscheduler.schedulers.background -import apscheduler.triggers.cron import json import logging import os @@ -1309,35 +1307,6 @@ class NodePool(threading.Thread): t.name) t.online = False - def reconfigureCrons(self, config): - cron_map = { - 'cleanup': self._doPeriodicCleanup, - 'check': self._doPeriodicCheck, - } - - if not self.apsched: - self.apsched = apscheduler.schedulers.background.BackgroundScheduler() - self.apsched.start() - - for c in config.crons.values(): - if ((not self.config) or - c.timespec != self.config.crons[c.name].timespec): - if self.config and self.config.crons[c.name].job: - self.config.crons[c.name].job.remove() - parts = c.timespec.split() - if len(parts) > 5: - second = parts[5] - else: - second = None - minute, hour, dom, month, dow = parts[:5] - trigger = apscheduler.triggers.cron.CronTrigger( - month=month, day=dom, day_of_week=dow, - hour=hour, minute=minute, second=second) - c.job = self.apsched.add_job( - cron_map[c.name], trigger=trigger) - else: - c.job = self.config.crons[c.name].job - def reconfigureZooKeeper(self, config): if self.config: running = self.config.zookeeper_servers.values() @@ -1714,19 +1683,6 @@ class NodePool(threading.Thread): launch_timeout) t.start() - def deleteNode(self, node_id): - try: - self._delete_threads_lock.acquire() - if node_id in self._delete_threads: - return - t = NodeDeleter(self, node_id) - self._delete_threads[node_id] = t - t.start() - except Exception: - self.log.exception("Could not delete node %s", node_id) - finally: - self._delete_threads_lock.release() - def _deleteNode(self, session, node): self.log.debug("Deleting node id: %s which has been in %s " "state for %s hours" % @@ -1782,72 +1738,6 @@ class NodePool(threading.Thread): self.statsd.incr(key) self.updateStats(session, node.provider_name) - def deleteInstance(self, provider_name, external_id): - key = (provider_name, external_id) - try: - self._instance_delete_threads_lock.acquire() - if key in self._instance_delete_threads: - return - t = InstanceDeleter(self, provider_name, external_id) - self._instance_delete_threads[key] = t - t.start() - except Exception: - self.log.exception("Could not delete instance %s on provider %s", - provider_name, external_id) - finally: - self._instance_delete_threads_lock.release() - - def _deleteInstance(self, provider_name, external_id): - provider = self.config.providers[provider_name] - manager = self.getProviderManager(provider) - manager.cleanupServer(external_id) - - def _doPeriodicCleanup(self): - if self.no_deletes: - return - try: - self.periodicCleanup() - except Exception: - self.log.exception("Exception in periodic cleanup:") - - def periodicCleanup(self): - # This function should be run periodically to clean up any hosts - # that may have slipped through the cracks, as well as to remove - # old images. 
- - self.log.debug("Starting periodic cleanup") - - for k, t in self._delete_threads.items()[:]: - if not t.isAlive(): - del self._delete_threads[k] - - for k, t in self._instance_delete_threads.items()[:]: - if not t.isAlive(): - del self._instance_delete_threads[k] - - node_ids = [] - with self.getDB().getSession() as session: - for node in session.getNodes(): - node_ids.append(node.id) - - for node_id in node_ids: - try: - with self.getDB().getSession() as session: - node = session.getNode(node_id) - if node: - self.cleanupOneNode(session, node) - except Exception: - self.log.exception("Exception cleaning up node id %s:" % - node_id) - - try: - self.cleanupLeakedInstances() - pass - except Exception: - self.log.exception("Exception cleaning up leaked nodes") - - self.log.debug("Finished periodic cleanup") - def cleanupLeakedInstances(self): known_providers = self.config.providers.keys() for provider in self.config.providers.values(): @@ -1888,35 +1778,6 @@ class NodePool(threading.Thread): if provider.clean_floating_ips: manager.cleanupLeakedFloaters() - def cleanupOneNode(self, session, node): - now = time.time() - time_in_state = now - node.state_time - if (node.state in [nodedb.READY, nodedb.HOLD]): - return - delete = False - if (node.state == nodedb.DELETE): - delete = True - elif (node.state == nodedb.TEST and - time_in_state > TEST_CLEANUP): - delete = True - elif time_in_state > NODE_CLEANUP: - delete = True - if delete: - try: - self.deleteNode(node.id) - except Exception: - self.log.exception("Exception deleting node id: " - "%s" % node.id) - - def _doPeriodicCheck(self): - if self.no_deletes: - return - try: - with self.getDB().getSession() as session: - self.periodicCheck(session) - except Exception: - self.log.exception("Exception in periodic check:") - def periodicCheck(self, session): # This function should be run periodically to make sure we can # still access hosts via ssh. diff --git a/requirements.txt b/requirements.txt index 324e4e45f..b6f254c00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,6 @@ paramiko>1.11.6,<2.0.0 python-daemon>=2.0.4,<2.1.0 extras statsd>=3.0 -apscheduler>=3.0 sqlalchemy>=0.8.2,<1.1.0 PyMySQL PrettyTable>=0.6,<0.8 From b2d053d06c5ca5ca5473b45113c31a572acdde26 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 16 Feb 2017 16:27:26 -0500 Subject: [PATCH 061/309] Use thread name in launcher ID The Thread.ident attribute is apparently returning None for some reason in tests, causing our launchers to share an ID. Change it to Thread.name, which we know has a value. Change-Id: Ib5c54804224ffd147372b2d1cfa6b62cc8a8b4cc --- nodepool/nodepool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index dac12e6a4..14091ccfc 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1021,7 +1021,7 @@ class ProviderWorker(threading.Thread): self.zk = self.getZK() self.launcher_id = "%s-%s-%s" % (socket.gethostname(), os.getpid(), - self.ident) + self.name) #---------------------------------------------------------------- # Private methods From b740c8907e45e830f5092b1a592cb62da8f7637f Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 17 Feb 2017 09:44:28 -0500 Subject: [PATCH 062/309] Disable test_image_upload_fail This test is currently flapping and needs to be reworked. Disable again as not to side track the PTG effort. 
Change-Id: I23ed4024fc3d2b906c2ae4702042ed65ce020a6b Signed-off-by: Paul Belanger --- nodepool/tests/test_builder.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nodepool/tests/test_builder.py b/nodepool/tests/test_builder.py index 476f6998e..9d95f45a1 100644 --- a/nodepool/tests/test_builder.py +++ b/nodepool/tests/test_builder.py @@ -15,6 +15,7 @@ import os import fixtures +from unittest import skip from nodepool import builder, exceptions, fakeprovider, tests from nodepool import zk @@ -95,6 +96,7 @@ class TestNodePoolBuilder(tests.DBTestCase): nb.start() nb.stop() + @skip("Disabled for early v3 development") def test_image_upload_fail(self): """Test that image upload fails are handled properly.""" From 08f0a2d827232b4a9912fd5aff64887242afa292 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 16 Feb 2017 16:36:08 -0500 Subject: [PATCH 063/309] Fix possible race with node request handling We were properly double checking the state after locking the request, but we weren't actually querying ZK to get the updated request info. Change-Id: I5764bd53478b94fdb4dd99fb04731f97cc203f73 --- nodepool/nodepool.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 14091ccfc..1c3677473 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1076,8 +1076,9 @@ class ProviderWorker(threading.Thread): except exceptions.ZKLockException: continue - # Make sure the state didn't change on us - if req.state != zk.REQUESTED: + # Make sure the state didn't change on us after getting the lock + req2 = self.zk.getNodeRequest(req_id) + if req2 and req2.state != zk.REQUESTED: self.zk.unlockNodeRequest(req) continue From 78dcd29fa398fc1ec8bf23c8c74e0fdce6581169 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 17 Feb 2017 10:02:07 -0500 Subject: [PATCH 064/309] Disable CleanupWorker thread for test_image_upload_fail We currently have a race condition between our cleanup worker and our unit test. My hope is, if we agree to disable the CleanupWorker thread for the test, we still consider this a valid test. Change-Id: I04b87ef044de7f99cc9cbd0c08747e53d383693b Signed-off-by: Paul Belanger --- nodepool/builder.py | 28 +++++++++++++--------------- nodepool/tests/__init__.py | 9 +++++---- nodepool/tests/test_builder.py | 6 +++--- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/nodepool/builder.py b/nodepool/builder.py index a3457f135..a1802b4a2 100644 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -1114,18 +1114,19 @@ class NodePoolBuilder(object): w.start() self._upload_workers.append(w) - self._janitor = CleanupWorker(0, self._config_path, - self.cleanup_interval, self.zk) - self._janitor.start() + if self.cleanup_interval > 0: + self._janitor = CleanupWorker( + 0, self._config_path, self.cleanup_interval, self.zk) + self._janitor.start() # Wait until all threads are running. Otherwise, we have a race # on the worker _running attribute if shutdown() is called before # run() actually begins. + workers = self._build_workers + self._upload_workers + if self._janitor: + workers += [self._janitor] while not all([ - x.running for x in (self._build_workers - + self._upload_workers - + [self._janitor]) - ]): + x.running for x in (workers)]): time.sleep(0) def stop(self): @@ -1138,10 +1139,10 @@ class NodePoolBuilder(object): ''' with self._start_lock: self.log.debug("Stopping. 
NodePoolBuilder shutting down workers") - for worker in (self._build_workers - + self._upload_workers - + [self._janitor] - ): + workers = self._build_workers + self._upload_workers + if self._janitor: + workers += [self._janitor] + for worker in (workers): worker.shutdown() self._running = False @@ -1149,10 +1150,7 @@ class NodePoolBuilder(object): self.log.debug('Waiting for jobs to complete') # Do not exit until all of our owned threads exit. - for worker in (self._build_workers - + self._upload_workers - + [self._janitor] - ): + for worker in (workers): worker.join() self.log.debug('Terminating ZooKeeper connection') diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 84deaf5e3..241e414fd 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -272,15 +272,16 @@ class MySQLSchemaFixture(fixtures.Fixture): class BuilderFixture(fixtures.Fixture): - def __init__(self, configfile): + def __init__(self, configfile, cleanup_interval): super(BuilderFixture, self).__init__() self.configfile = configfile + self.cleanup_interval = cleanup_interval self.builder = None def setUp(self): super(BuilderFixture, self).setUp() self.builder = builder.NodePoolBuilder(self.configfile) - self.builder.cleanup_interval = .5 + self.builder.cleanup_interval = self.cleanup_interval self.builder.build_interval = .1 self.builder.upload_interval = .1 self.builder.dib_cmd = 'nodepool/tests/fake-image-create' @@ -467,8 +468,8 @@ class DBTestCase(BaseTestCase): self.addCleanup(app.stop) return app - def _useBuilder(self, configfile): - self.useFixture(BuilderFixture(configfile)) + def _useBuilder(self, configfile, cleanup_interval=.5): + self.useFixture(BuilderFixture(configfile, cleanup_interval)) def setupZK(self): f = ZookeeperServerFixture() diff --git a/nodepool/tests/test_builder.py b/nodepool/tests/test_builder.py index 9d95f45a1..63564e6d2 100644 --- a/nodepool/tests/test_builder.py +++ b/nodepool/tests/test_builder.py @@ -15,7 +15,6 @@ import os import fixtures -from unittest import skip from nodepool import builder, exceptions, fakeprovider, tests from nodepool import zk @@ -96,7 +95,6 @@ class TestNodePoolBuilder(tests.DBTestCase): nb.start() nb.stop() - @skip("Disabled for early v3 development") def test_image_upload_fail(self): """Test that image upload fails are handled properly.""" @@ -115,7 +113,9 @@ class TestNodePoolBuilder(tests.DBTestCase): configfile = self.setup_config('node.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) - self._useBuilder(configfile) + # NOTE(pabelanger): Disable CleanupWorker thread for nodepool-builder + # as we currently race it to validate our failed uploads. + self._useBuilder(configfile, cleanup_interval=0) pool.start() self.waitForImage('fake-provider', 'fake-image') nodes = self.waitForNodes('fake-label') From 8f46532c542902b8022782f0fd023e98862e9c4a Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 20 Feb 2017 11:07:50 -0500 Subject: [PATCH 065/309] Bug fix: Failed nodes were not getting unlocked We were resetting the nodeset before unlocking it if we had failed nodes, leaving the nodes in a LOCKED state, which is no good. There's no reason to reset the nodeset, so just remove that line. 
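A condensed sketch of the ordering bug described above, assuming a handler whose _unlockNodeSet() walks self.nodeset (as the handler code in this series does); everything else here is illustrative:

    # Illustrative sketch: emptying the nodeset before the unlock pass runs
    # leaves every node locked, because the unlock loop has nothing to visit.
    class HandlerSketch(object):
        def __init__(self, zk, nodeset):
            self.zk = zk
            self.nodeset = nodeset

        def _unlockNodeSet(self):
            for node in self.nodeset:
                self.zk.unlockNode(node)

        def handle_failure_buggy(self):
            self.nodeset = []      # old behavior: reset first ...
            self._unlockNodeSet()  # ... so nothing gets unlocked

        def handle_failure_fixed(self):
            self._unlockNodeSet()  # fix: drop the reset; just unlock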
Change-Id: Ic290682f0efb6b8ca8587ae5ecbda806f2745667 --- nodepool/nodepool.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 1c3677473..be7e077c7 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -977,7 +977,6 @@ class NodeRequestHandler(object): return True if self.launch_manager.failed_nodes: - self.nodeset = [] self.request.declined_by.append(self.launcher_id) launchers = set(self.zk.getRegisteredLaunchers()) if launchers.issubset(set(self.request.declined_by)): From 079204ef0b320cb20b4af0741222e6bd8a7b9605 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 20 Feb 2017 11:42:01 -0500 Subject: [PATCH 066/309] Bug fix: Delete FAILED nodes Change-Id: I6db31e21adb6924374b973ad1a3c99212a83acbe --- nodepool/nodepool.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index be7e077c7..b2bc17b02 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1183,7 +1183,8 @@ class NodeCleanupWorker(threading.Thread): continue # Any nodes in these states that are unlocked can be deleted. - if node.state in (zk.USED, zk.IN_USE, zk.BUILDING, zk.DELETING): + if node.state in (zk.USED, zk.IN_USE, zk.BUILDING, zk.FAILED, + zk.DELETING): try: zk_conn.lockNode(node, blocking=False) except exceptions.ZKLockException: From ce6a9cd646dfdb21fbc8e1b6a8a523f973241783 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 20 Feb 2017 12:35:01 -0500 Subject: [PATCH 067/309] Bug Fix: Call _runReadyScript correctly Change-Id: I2379be3726e7ea7d2c6dac8fb2b6cf9a806d1289 --- nodepool/nodepool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index b2bc17b02..101254cee 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -623,7 +623,7 @@ class NodeLauncher(threading.Thread): self._writeNodepoolInfo(host, preferred_ip, self._node) if self._label.ready_script: - self.runReadyScript(host, hostname, self._label.ready_script) + self._runReadyScript(host, hostname, self._label.ready_script) def _writeNodepoolInfo(self, host, preferred_ip, node): key = paramiko.RSAKey.generate(2048) From b040984b604a86cca04fa4609ef95e551aeabcdd Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 20 Feb 2017 12:51:24 -0500 Subject: [PATCH 068/309] Bug fix: Delete instance during launch retries If we launched an instance, and we find a problem with it (cannot ssh in, for example), we should delete it before another launch attempt. Change-Id: I784997aabf9457f8b59f14620ec6ec069734b122 --- nodepool/nodepool.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 101254cee..4d9c85336 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -681,6 +681,14 @@ class NodeLauncher(threading.Thread): except Exception: self.log.exception("Launch attempt %d/%d failed for node %s:", attempts, self._retries, self._node.id) + # If we created an instance, delete it. 
+ if self._node.external_id: + self._manager.cleanupServer(self._node.external_id) + self._manager.waitForServerDeletion(self._node.external_id) + self._node.external_id = None + self._node.public_ipv4 = None + self._node.public_ipv6 = None + self._zk.storeNode(self._node) if attempts == self._retries: raise attempts += 1 From dc0b734e66fdfb90eabd599d7e9a4a2d78676e01 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 20 Feb 2017 10:00:51 -0500 Subject: [PATCH 069/309] Send min-ready requests only when images are ready Do not send min-ready requests until images are available. This helps prevent repeated request failures while waiting for images to upload to the provider. Change-Id: I360f6079b5c04c06a59e3766f61126f038714a78 --- nodepool/nodepool.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 1c3677473..6d3e6beb7 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1544,6 +1544,20 @@ class NodePool(threading.Thread): "No more active min-ready requests for label %s", label) del self._submittedRequests[label] + def labelImageIsAvailable(self, label): + ''' + Check if the image associated with a label is ready in any provider. + + :param Label label: The label config object. + + :returns: True if image associated with the label is uploaded and + ready in at least one provider. False otherwise. + ''' + for provider_name in label.providers.keys(): + if self.zk.getMostRecentImageUpload(label.image, provider_name): + return True + return False + def createMinReady(self): ''' Create node requests to make the minimum amount of ready nodes. @@ -1587,7 +1601,7 @@ class NodePool(threading.Thread): elif len(ready_nodes[label.name]) < min_ready: need = min_ready - len(ready_nodes[label.name]) - if need: + if need and self.labelImageIsAvailable(label): # Create requests for 1 node at a time. This helps to split # up requests across providers, and avoids scenario where a # single provider might fail the entire request because of From 39aec713a34f01ee84f03427c66ed4908b48c49d Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 20 Feb 2017 13:44:45 -0500 Subject: [PATCH 070/309] Check for not found node in 'delete' command Display a friendly message when a node is not found for the nodepool client 'delete' command. 
Change-Id: I169f7e1b7882020273ef9a2780cf8e7236ffebea --- nodepool/cmd/nodepoolcmd.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nodepool/cmd/nodepoolcmd.py b/nodepool/cmd/nodepoolcmd.py index ed9ebaab4..a8c52f575 100644 --- a/nodepool/cmd/nodepoolcmd.py +++ b/nodepool/cmd/nodepoolcmd.py @@ -262,6 +262,10 @@ class NodePoolCmd(NodepoolApp): def delete(self): node = self.zk.getNode(self.args.id) + if not node: + print("Node id %s not found" % self.args.id) + return + provider = self.pool.config.providers[node.provider] self.zk.lockNode(node, blocking=True, timeout=5) From 646c48800be66149649408745efb444eea5ea940 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 20 Feb 2017 15:17:40 -0500 Subject: [PATCH 071/309] Remove verbose logging related to config loading Change-Id: I9e70b661ba50e878ed39e1f5a3619fcc8fbcb6f8 --- nodepool/nodepool.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 4d9c85336..c26ba1094 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1270,7 +1270,6 @@ class NodePool(threading.Thread): self.log.debug("Finished stopping") def loadConfig(self): - self.log.debug("Loading configuration") config = nodepool_config.loadConfig(self.configfile) nodepool_config.loadSecureConfig(config, self.securefile) return config @@ -1324,7 +1323,6 @@ class NodePool(threading.Thread): configured = config.zookeeper_servers.values() if running == configured: - self.log.debug("Zookeeper client does not need to be updated") return if not self.zk and configured: From ac716b87d14fcd5c1b3966f1d4042d494bfdf4c3 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 21 Feb 2017 09:20:10 -0500 Subject: [PATCH 072/309] Node requests should be ephemeral Node requests are supposed to be ephemeral so that if the requestor dies, the request becomes invalid and disappears. Change-Id: I30ffc10247c06ebfe01e4e707eb7b6caa7e4f85e --- nodepool/zk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nodepool/zk.py b/nodepool/zk.py index 24d3de288..440d60266 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -1270,6 +1270,7 @@ class ZooKeeper(object): path = self.client.create( path, value=request.serialize(), + ephemeral=True, sequence=True, makepath=True) request.id = path.split("/")[-1] From 71035081d5b57decb4f3a30070905883d433d64e Mon Sep 17 00:00:00 2001 From: Jamie Lennox Date: Tue, 21 Feb 2017 10:15:31 -0500 Subject: [PATCH 073/309] Allow configuring nodepool launch retries Nodepool currently hardcodes that 3 attempts are made to upload an image to the cloud. Allow modifying this in your provider configuration. 
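A small sketch of how the new provider option is consumed, assuming a parsed provider section as a plain dict; only the 'launch-retries' key, its default of 3, and the retry-until-exhausted behavior come from this change (which, per the docs hunk below, governs server launch attempts) — the helper names are illustrative:

    # Illustrative sketch: read the per-provider retry count (default 3) and
    # retry a launch callable until the attempts are exhausted.
    def get_launch_retries(provider_section):
        return provider_section.get('launch-retries', 3)

    def launch_with_retries(launch_fn, retries):
        attempts = 1
        while True:
            try:
                return launch_fn()
            except Exception:
                if attempts >= retries:
                    raise
                attempts += 1

    provider = {'name': 'fake-provider', 'launch-retries': 2}
    # launch_with_retries(create_server, get_launch_retries(provider))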
Change-Id: I61f44e163d419771824daa2039f7cdecc74742aa --- doc/source/configuration.rst | 8 +++ nodepool/cmd/config_validator.py | 1 + nodepool/config.py | 1 + nodepool/nodepool.py | 3 +- .../tests/fixtures/config_validate/good.yaml | 1 + .../tests/fixtures/node_launch_retry.yaml | 55 +++++++++++++++++++ nodepool/tests/test_nodepool.py | 21 +++++++ 7 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 nodepool/tests/fixtures/node_launch_retry.yaml diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index 1f4eea811..82a5b0016 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -296,6 +296,7 @@ provider, the Nodepool image types are also defined (see - az1 boot-timeout: 120 launch-timeout: 900 + launch-retries: 3 image-name-format: 'template-{image_name}-{timestamp}' hostname-format: '{label.name}-{provider.name}-{node.id}' ipv6-preferred: False @@ -413,6 +414,13 @@ provider, the Nodepool image types are also defined (see In seconds. Default 3600. + ``launch-retries`` + + The number of times to retry launching a server before considering the job + failed. + + Default 3. + ``keypair`` Default None diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index 65afd65cc..f49ffe3d2 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -74,6 +74,7 @@ class ConfigValidator: 'boot-timeout': int, 'api-timeout': int, 'launch-timeout': int, + 'launch-retries': int, 'rate': float, 'images': [images], 'hostname-format': str, diff --git a/nodepool/config.py b/nodepool/config.py index 7a49152ba..0f76c551b 100644 --- a/nodepool/config.py +++ b/nodepool/config.py @@ -182,6 +182,7 @@ def loadConfig(config_path): p.api_timeout = provider.get('api-timeout') p.boot_timeout = provider.get('boot-timeout', 60) p.launch_timeout = provider.get('launch-timeout', 3600) + p.launch_retries = provider.get('launch-retries', 3) p.networks = [] for network in provider.get('networks', []): n = Network() diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index c26ba1094..c55f130ad 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -900,7 +900,8 @@ class NodeRequestHandler(object): self.zk.storeNodeRequest(self.request) self.launch_manager = NodeLaunchManager( - self.zk, self.provider, self.labels, self.manager, retries=3) + self.zk, self.provider, self.labels, self.manager, + retries=self.provider.launch_retries) ready_nodes = self.zk.getReadyNodesOfTypes(self.request.node_types) for ntype in self.request.node_types: diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml index 087bdfa9e..0da775bdf 100644 --- a/nodepool/tests/fixtures/config_validate/good.yaml +++ b/nodepool/tests/fixtures/config_validate/good.yaml @@ -38,6 +38,7 @@ providers: boot-timeout: 120 max-servers: 184 max-concurrency: 10 + launch-retries: 3 rate: 0.001 images: - name: trusty diff --git a/nodepool/tests/fixtures/node_launch_retry.yaml b/nodepool/tests/fixtures/node_launch_retry.yaml new file mode 100644 index 000000000..eee985ed2 --- /dev/null +++ b/nodepool/tests/fixtures/node_launch_retry.yaml @@ -0,0 +1,55 @@ +elements-dir: . 
+images-dir: '{images_dir}' + +cron: + check: '*/15 * * * *' + cleanup: '*/1 * * * *' + +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +labels: + - name: fake-label + image: fake-image + min-ready: 0 + providers: + - name: fake-provider + +providers: + - name: fake-provider + region-name: fake-region + keypair: 'if-present-use-this-keypair' + username: 'fake' + password: 'fake' + auth-url: 'fake' + project-id: 'fake' + max-servers: 96 + pool: 'fake' + launch-retries: 2 + networks: + - net-id: 'some-uuid' + rate: 0.0001 + images: + - name: fake-image + min-ram: 8192 + name-filter: 'Fake' + meta: + key: value + key2: value + +targets: + - name: fake-target + +diskimages: + - name: fake-image + elements: + - fedora + - vm + release: 21 + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index f2e88e36c..74732890a 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -271,6 +271,27 @@ class TestNodepool(tests.DBTestCase): self.assertEqual('fake-provider', new_nodes[0].provider) self.assertNotEqual(nodes[0], new_nodes[0]) + @mock.patch('nodepool.provider_manager.FakeProviderManager.createServer') + def test_node_launch_retries(self, mock_create_server): + mock_create_server.side_effect = Exception('Boom!') + + configfile = self.setup_config('node_launch_retry.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self._useBuilder(configfile) + pool.start() + self.waitForImage('fake-provider', 'fake-image') + + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) + + req = self.waitForNodeRequest(req) + self.assertEqual(req.state, zk.FAILED) + + # retries in config is set to 2, so 2 attempts to create a server + self.assertEqual(2, mock_create_server.call_count) + @skip("Disabled for early v3 development") def test_node_delete_failure(self): def fail_delete(self, name): From 3f42a89df9af528daabaff6e4f827f0f3cb5afac Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 21 Feb 2017 12:59:53 -0500 Subject: [PATCH 074/309] Support launch failures in FakeProviderManager Let's not use mock for testing launch failures. Instead, add an attribute to FakeProviderManager that tells it how many times successive calls to createServer() should fail. 
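A usage sketch of the failure-injection attribute this adds, following the updated test below; the fake class here is a stand-in, but createServer_fails and its decrement-per-failed-call behavior match the FakeProviderManager change:

    # Illustrative sketch: each createServer() call consumes one queued
    # failure before a (fake) launch succeeds.
    class FakeManagerSketch(object):
        def __init__(self):
            self.createServer_fails = 0

        def createServer(self, *args, **kwargs):
            while self.createServer_fails:
                self.createServer_fails -= 1
                raise Exception("Expected createServer exception")
            return {'id': 'fake-server'}

    manager = FakeManagerSketch()
    manager.createServer_fails = 2
    for _ in range(3):
        try:
            manager.createServer()
        except Exception:
            pass
    assert manager.createServer_fails == 0  # both injected failures consumed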
Change-Id: Iba6f8f89de84b06d2c858b0ee69bc65c37ef3cf0 --- nodepool/provider_manager.py | 7 +++++++ nodepool/tests/test_nodepool.py | 23 +++++++++++------------ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/nodepool/provider_manager.py b/nodepool/provider_manager.py index 6dc887fde..0206d720f 100644 --- a/nodepool/provider_manager.py +++ b/nodepool/provider_manager.py @@ -362,8 +362,15 @@ class ProviderManager(object): class FakeProviderManager(ProviderManager): def __init__(self, provider, use_taskmanager): + self.createServer_fails = 0 self.__client = fakeprovider.FakeOpenStackCloud() super(FakeProviderManager, self).__init__(provider, use_taskmanager) def _getClient(self): return self.__client + + def createServer(self, *args, **kwargs): + while self.createServer_fails: + self.createServer_fails -= 1 + raise Exception("Expected createServer exception") + return super(FakeProviderManager, self).createServer(*args, **kwargs) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 74732890a..59db57922 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -15,7 +15,6 @@ import json import logging -import mock import time from unittest import skip @@ -66,19 +65,19 @@ class TestNodepool(tests.DBTestCase): self.zk.lockNode(node, blocking=False) self.zk.unlockNode(node) - @mock.patch('nodepool.nodepool.NodeLauncher._launchNode') - def test_fail_request_on_launch_failure(self, mock_launch): + def test_fail_request_on_launch_failure(self): ''' Test that provider launch error fails the request. ''' - mock_launch.side_effect = Exception() - - configfile = self.setup_config('node.yaml') + configfile = self.setup_config('node_launch_retry.yaml') self._useBuilder(configfile) self.waitForImage('fake-provider', 'fake-image') pool = self.useNodepool(configfile, watermark_sleep=1) pool.start() + self.wait_for_config(pool) + manager = pool.getProviderManager('fake-provider') + manager.createServer_fails = 2 req = zk.NodeRequest() req.state = zk.REQUESTED @@ -86,7 +85,7 @@ class TestNodepool(tests.DBTestCase): self.zk.storeNodeRequest(req) req = self.waitForNodeRequest(req) - self.assertTrue(mock_launch.called) + self.assertEqual(0, manager.createServer_fails) self.assertEqual(req.state, zk.FAILED) self.assertNotEqual(req.declined_by, []) @@ -271,14 +270,14 @@ class TestNodepool(tests.DBTestCase): self.assertEqual('fake-provider', new_nodes[0].provider) self.assertNotEqual(nodes[0], new_nodes[0]) - @mock.patch('nodepool.provider_manager.FakeProviderManager.createServer') - def test_node_launch_retries(self, mock_create_server): - mock_create_server.side_effect = Exception('Boom!') - + def test_node_launch_retries(self): configfile = self.setup_config('node_launch_retry.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) self._useBuilder(configfile) pool.start() + self.wait_for_config(pool) + manager = pool.getProviderManager('fake-provider') + manager.createServer_fails = 2 self.waitForImage('fake-provider', 'fake-image') req = zk.NodeRequest() @@ -290,7 +289,7 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(req.state, zk.FAILED) # retries in config is set to 2, so 2 attempts to create a server - self.assertEqual(2, mock_create_server.call_count) + self.assertEqual(0, manager.createServer_fails) @skip("Disabled for early v3 development") def test_node_delete_failure(self): From b679863e399bd60608e654d12581147af40511ec Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Wed, 22 Feb 2017 17:34:19 -0500 Subject: [PATCH 075/309] Add some lock debug lines and an exception handler Change-Id: Ieb0d71d42fefe36af2c995162bdd007873ef3772 --- nodepool/nodepool.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index c55f130ad..6671d907e 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -871,6 +871,8 @@ class NodeRequestHandler(object): self.zk.unlockNode(node) except Exception: self.log.exception("Error unlocking node:") + self.log.debug("Unlocked node %s for request %s", + node.id, self.request.id) def _run(self): ''' @@ -886,9 +888,13 @@ class NodeRequestHandler(object): expected failure from the underlying library, which is ok for now. ''' if not self._imagesAvailable() or self._wouldExceedQuota(): + self.log.debug("Declining node request %s", + self.request.id) self.request.declined_by.append(self.launcher_id) launchers = set(self.zk.getRegisteredLaunchers()) if launchers.issubset(set(self.request.declined_by)): + self.log.debug("Failing declined node request %s", + self.request.id) # All launchers have declined it self.request.state = zk.FAILED self.zk.storeNodeRequest(self.request) @@ -896,6 +902,7 @@ class NodeRequestHandler(object): self.done = True return + self.log.debug("Accepting node request %s", self.request.id) self.request.state = zk.PENDING self.zk.storeNodeRequest(self.request) @@ -915,6 +922,9 @@ class NodeRequestHandler(object): # It's already locked so skip it. continue else: + self.log.debug( + "Locked existing node %s for request %s", + node.id, self.request.id) got_a_node = True node.allocated_to = self.request.id self.zk.storeNode(node) @@ -935,6 +945,8 @@ class NodeRequestHandler(object): # locked anywhere. self.zk.storeNode(node) self.zk.lockNode(node, blocking=False) + self.log.debug("Locked building node %s for request %s", + node.id, self.request.id) # Set state AFTER lock so sthat it isn't accidentally cleaned # up (unlocked BUILDING nodes will be deleted). @@ -990,6 +1002,8 @@ class NodeRequestHandler(object): launchers = set(self.zk.getRegisteredLaunchers()) if launchers.issubset(set(self.request.declined_by)): # All launchers have declined it + self.log.debug("Failing declined node request %s", + self.request.id) self.request.state = zk.FAILED else: self.request.state = zk.REQUESTED @@ -998,6 +1012,8 @@ class NodeRequestHandler(object): for node in self.nodeset: # Record node ID in the request self.request.nodes.append(node.id) + self.log.debug("Fulfilled node request %s", + self.request.id) self.request.state = zk.FULFILLED self._unlockNodeSet() @@ -1134,8 +1150,11 @@ class ProviderWorker(threading.Thread): # Make sure we're always registered with ZK self.zk.registerLauncher(self.launcher_id) - self._assignHandlers() - self._removeCompletedHandlers() + try: + self._assignHandlers() + self._removeCompletedHandlers() + except Exception: + self.log.exception("Error in ProviderWorker:") time.sleep(self.watermark_sleep) def stop(self): From eccebb1de869910b3ed9e6e897aed802ff98bf43 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 23 Feb 2017 10:53:37 -0500 Subject: [PATCH 076/309] Fix fulfilled log line This line was over-indented. Also, add a log entry for a declined code path that was missed. 
Change-Id: Id9e3b47e86d5bd89332aa16eeb07decc528cc3db --- nodepool/nodepool.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 6671d907e..cd3982a5b 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -998,6 +998,8 @@ class NodeRequestHandler(object): return True if self.launch_manager.failed_nodes: + self.log.debug("Declining node request %s", + self.request.id) self.request.declined_by.append(self.launcher_id) launchers = set(self.zk.getRegisteredLaunchers()) if launchers.issubset(set(self.request.declined_by)): @@ -1012,8 +1014,8 @@ class NodeRequestHandler(object): for node in self.nodeset: # Record node ID in the request self.request.nodes.append(node.id) - self.log.debug("Fulfilled node request %s", - self.request.id) + self.log.debug("Fulfilled node request %s", + self.request.id) self.request.state = zk.FULFILLED self._unlockNodeSet() From 599d71bad25d097ae080517f8a79a3d3238ce264 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 23 Feb 2017 14:56:45 -0500 Subject: [PATCH 077/309] Add reasons to decline log entries Also log the full information for a node request once when it is assigned. Change-Id: I81081e75d849b7e8547f336084e2e518b020eb55 --- nodepool/nodepool.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 441e003a5..d42268c0b 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -887,9 +887,14 @@ class NodeRequestHandler(object): launcher has already started doing so. This would cause an expected failure from the underlying library, which is ok for now. ''' - if not self._imagesAvailable() or self._wouldExceedQuota(): - self.log.debug("Declining node request %s", - self.request.id) + declined_reasons = [] + if not self._imagesAvailable(): + declined_reasons.append('images are not available') + if self._wouldExceedQuota(): + declined_reasons.append('it would exceed quota') + if declined_reasons: + self.log.debug("Declining node request %s because %s", + self.request.id, ', '.join(declined_reasons)) self.request.declined_by.append(self.launcher_id) launchers = set(self.zk.getRegisteredLaunchers()) if launchers.issubset(set(self.request.declined_by)): @@ -998,7 +1003,7 @@ class NodeRequestHandler(object): return True if self.launch_manager.failed_nodes: - self.log.debug("Declining node request %s", + self.log.debug("Declining node request %s because nodes failed", self.request.id) self.request.declined_by.append(self.launcher_id) launchers = set(self.zk.getRegisteredLaunchers()) @@ -1109,7 +1114,7 @@ class ProviderWorker(threading.Thread): continue # Got a lock, so assign it - self.log.info("Assigning node request %s" % req.id) + self.log.info("Assigning node request %s" % req) rh = NodeRequestHandler(self, req) rh.run() self.request_handlers.append(rh) From dd36a8588a205fb9b128c29acd04f0fb6e7126e6 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 20 Feb 2017 14:33:09 -0500 Subject: [PATCH 078/309] Cleanup node request locks Node requests are created in /nodepool/requests. The request locks are created in /nodepool/requests-lock. When a node request is deleted, or if it simply disappears (it is an ephemeral znode, so it could just go away), the lock will still remain. Add code to delete locks where there is no equivalent request. 
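The cleanup itself amounts to a set difference between the children of the two znode roots named above. A condensed sketch of that idea, assuming a kazoo-style client (the real helpers added to nodepool.py and zk.py appear in the diff below):

    import kazoo.exceptions as kze

    REQUEST_ROOT = '/nodepool/requests'
    REQUEST_LOCK_ROOT = '/nodepool/requests-lock'

    def cleanup_request_locks(client):
        # Any lock id without a matching request znode is an orphan.
        requests = set(client.get_children(REQUEST_ROOT))
        locks = set(client.get_children(REQUEST_LOCK_ROOT))
        for lock_id in locks - requests:
            try:
                client.delete('%s/%s' % (REQUEST_LOCK_ROOT, lock_id))
            except kze.NoNodeError:
                pass
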
Change-Id: I7c442bf13f0995e0361745a5817a3e244947407b --- nodepool/nodepool.py | 16 ++++++++++++++++ nodepool/tests/__init__.py | 11 +++++++++++ nodepool/tests/test_nodepool.py | 7 +++++++ nodepool/tests/test_zk.py | 22 ++++++++++++++++++++++ nodepool/zk.py | 22 ++++++++++++++++++++++ 5 files changed, 78 insertions(+) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index d42268c0b..12fe199c7 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1184,6 +1184,21 @@ class NodeCleanupWorker(threading.Thread): self._interval = interval self._running = False + def _cleanupNodeRequestLocks(self): + ''' + Remove request locks where the request no longer exists. + + Because the node request locks are not direct children of the request + znode, we need to remove the locks separately after the request has + been processed. + ''' + zk = self._nodepool.getZK() + requests = zk.getNodeRequests() + locks = zk.getNodeRequestLocks() + locks_without_requests = set(locks) - set(requests) + for lock_id in locks_without_requests: + zk.deleteNodeRequestLock(lock_id) + def _deleteInstance(self, node): ''' Delete an instance from a provider. @@ -1235,6 +1250,7 @@ class NodeCleanupWorker(threading.Thread): while self._running: try: + self._cleanupNodeRequestLocks() self._cleanupNodes() except Exception: self.log.exception("Exception in NodeCleanupWorker:") diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 84deaf5e3..a24657578 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -433,6 +433,17 @@ class DBTestCase(BaseTestCase): break time.sleep(1) + def waitForNodeRequestLockDeletion(self, request_id): + while True: + exists = False + for lock_id in self.zk.getNodeRequestLocks(): + if request_id == lock_id: + exists = True + break + if not exists: + break + time.sleep(1) + def waitForNodes(self, label, count=1): while True: self.wait_for_threads() diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 59db57922..42f853aaf 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -65,6 +65,13 @@ class TestNodepool(tests.DBTestCase): self.zk.lockNode(node, blocking=False) self.zk.unlockNode(node) + # Verify the cleanup thread removed the lock + self.assertIsNotNone( + self.zk.client.exists(self.zk._requestLockPath(req.id)) + ) + self.zk.deleteNodeRequest(req) + self.waitForNodeRequestLockDeletion(req.id) + def test_fail_request_on_launch_failure(self): ''' Test that provider launch error fails the request. 
diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index e23de4f4b..52907607b 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -585,6 +585,28 @@ class TestZooKeeper(tests.DBTestCase): with testtools.ExpectedException(StopIteration): i.next() + def test_getNodeRequestLocks(self): + req = self._create_node_request() + self.zk.lockNodeRequest(req, blocking=False) + locks = self.zk.getNodeRequestLocks() + self.assertEqual(1, len(locks)) + self.assertEqual(req.id, locks[0]) + self.zk.unlockNodeRequest(req) + self.zk.deleteNodeRequest(req) + + def test_deleteNodeRequestLock(self): + req = self._create_node_request() + self.zk.lockNodeRequest(req, blocking=False) + self.zk.unlockNodeRequest(req) + self.zk.deleteNodeRequest(req) + + # We expect the lock to linger even after the request is deleted + locks = self.zk.getNodeRequestLocks() + self.assertEqual(1, len(locks)) + self.assertEqual(req.id, locks[0]) + self.zk.deleteNodeRequestLock(locks[0]) + self.assertEqual([], self.zk.getNodeRequestLocks()) + class TestZKModel(tests.BaseTestCase): diff --git a/nodepool/zk.py b/nodepool/zk.py index 440d60266..dde4225c9 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -1240,6 +1240,28 @@ class ZooKeeper(object): return sorted(requests) + def getNodeRequestLocks(self): + ''' + Get the current list of all node request locks. + ''' + try: + locks = self.client.get_children(self.REQUEST_LOCK_ROOT) + except kze.NoNodeError: + return [] + return locks + + def deleteNodeRequestLock(self, lock): + ''' + Delete the znode for a node request lock. + + :param str lock: The lock ID. + ''' + path = self._requestLockPath(lock) + try: + self.client.delete(path) + except kze.NoNodeError: + pass + def getNodeRequest(self, request): ''' Get the data for a specific node request. From 8625185072d12978c6a98b6758e23a26d3d61e8f Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 24 Feb 2017 09:32:02 -0500 Subject: [PATCH 079/309] Delete node request locks after a delay It's possible the lock cleanup could run during the scenario where a lock is currently held for request processing, but the request has disappeared before the request handler has noticed it. Only delete locks after they've been around for at least 8 hours. Change-Id: I0f004116c67e9152160e9c193d75a8c944ef109d --- nodepool/nodepool.py | 17 +++++++---- nodepool/tests/test_nodepool.py | 1 + nodepool/tests/test_zk.py | 19 ++++++++++++ nodepool/zk.py | 51 +++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 5 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 12fe199c7..59368a930 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -45,6 +45,7 @@ WATERMARK_SLEEP = 10 # Interval between checking if new servers needed IMAGE_TIMEOUT = 6 * HOURS # How long to wait for an image save CONNECT_TIMEOUT = 10 * MINS # How long to try to connect after a server # is ACTIVE +LOCK_CLEANUP = 8 * HOURS # When to delete node request lock znodes NODE_CLEANUP = 8 * HOURS # When to start deleting a node that is not # READY or HOLD TEST_CLEANUP = 5 * MINS # When to start deleting a node that is in TEST @@ -1190,14 +1191,20 @@ class NodeCleanupWorker(threading.Thread): Because the node request locks are not direct children of the request znode, we need to remove the locks separately after the request has - been processed. + been processed. Only remove them after LOCK_CLEANUP seconds have + passed. 
This helps prevent the scenario where a request could go + away _while_ a lock is currently held for processing and the cleanup + thread attempts to delete it. The delay should reduce the chance that + we delete a currently held lock. ''' zk = self._nodepool.getZK() requests = zk.getNodeRequests() - locks = zk.getNodeRequestLocks() - locks_without_requests = set(locks) - set(requests) - for lock_id in locks_without_requests: - zk.deleteNodeRequestLock(lock_id) + now = time.time() + for lock in zk.nodeRequestLockIterator(): + if lock.id in requests: + continue + if (now - lock.stat.mtime/1000) > LOCK_CLEANUP: + zk.deleteNodeRequestLock(lock.id) def _deleteInstance(self, node): ''' diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 42f853aaf..04e36f3aa 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -39,6 +39,7 @@ class TestNodepool(tests.DBTestCase): self._useBuilder(configfile) image = self.waitForImage('fake-provider', 'fake-image') + nodepool.nodepool.LOCK_CLEANUP = 1 pool = self.useNodepool(configfile, watermark_sleep=1) pool.start() diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 52907607b..0e254efa6 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -594,6 +594,25 @@ class TestZooKeeper(tests.DBTestCase): self.zk.unlockNodeRequest(req) self.zk.deleteNodeRequest(req) + def test_getNodeRequestLock(self): + req = self._create_node_request() + self.zk.lockNodeRequest(req, blocking=False) + lock = self.zk.getNodeRequestLock(req.id) + self.assertEqual(lock.id, req.id) + self.assertIsNotNone(lock.stat) + self.zk.unlockNodeRequest(req) + self.zk.deleteNodeRequest(req) + + def test_nodeRequestLockIterator(self): + req = self._create_node_request() + self.zk.lockNodeRequest(req, blocking=False) + i = self.zk.nodeRequestLockIterator() + self.assertEqual(zk.NodeRequestLock(req.id), i.next()) + with testtools.ExpectedException(StopIteration): + i.next() + self.zk.unlockNodeRequest(req) + self.zk.deleteNodeRequest(req) + def test_deleteNodeRequestLock(self): req = self._create_node_request() self.zk.lockNodeRequest(req, blocking=False) diff --git a/nodepool/zk.py b/nodepool/zk.py index dde4225c9..cad3a71cb 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -308,6 +308,27 @@ class ImageUpload(BaseModel): return o +class NodeRequestLock(object): + ''' + Class representing a node request lock. + + This doesn't need to derive from BaseModel since this class exists only + to associate the znode stats with the lock. + ''' + def __init__(self, id=None): + self.id = id + self.stat = None + + def __eq__(self, other): + if isinstance(other, NodeRequestLock): + return (self.id == other.id) + else: + return False + + def __repr__(self): + return '' % self.id + + class NodeRequest(BaseModel): ''' Class representing a node request. @@ -1250,6 +1271,27 @@ class ZooKeeper(object): return [] return locks + def getNodeRequestLock(self, lock): + ''' + Get the data for a specific node request lock. + + Note that there is no user data set on a node request lock znode. The + main purpose for this method is to get the ZK stat data for the lock + so we can inspect it and use it for lock deletion. + + :param str lock: The node request lock ID. + + :returns: A NodeRequestLock object. 
+ ''' + path = self._requestLockPath(lock) + try: + data, stat = self.client.get(path) + except kze.NoNodeError: + return None + d = NodeRequestLock(lock) + d.stat = stat + return d + def deleteNodeRequestLock(self, lock): ''' Delete the znode for a node request lock. @@ -1518,3 +1560,12 @@ class ZooKeeper(object): node = self.getNode(node_id) if node: yield node + + def nodeRequestLockIterator(self): + ''' + Utility generator method for iterating through all nodes request locks. + ''' + for lock_id in self.getNodeRequestLocks(): + lock = self.getNodeRequestLock(lock_id) + if lock: + yield lock From e5f5840c272d75eea38367bf5eb3f8d9f42e57e5 Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Wed, 1 Mar 2017 10:17:20 -0600 Subject: [PATCH 080/309] Use devstack's zookeeper support devstack grew the ability to install and configure zookeeper in support of DLM and tooz. Go ahead and use it. Change-Id: I0436abf286acc1e77d96b4df1624b8d3435ec4be --- devstack/files/debs/nodepool | 1 - devstack/settings | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/devstack/files/debs/nodepool b/devstack/files/debs/nodepool index fe8b87512..ccba43056 100644 --- a/devstack/files/debs/nodepool +++ b/devstack/files/debs/nodepool @@ -2,4 +2,3 @@ qemu-utils kpartx debootstrap yum-utils -zookeeperd diff --git a/devstack/settings b/devstack/settings index d42263511..50a7faf83 100644 --- a/devstack/settings +++ b/devstack/settings @@ -21,6 +21,7 @@ DISKIMAGE_BUILDER_REPO_REF=${DISKIMAGE_BUILDER_REPO_REF:-master} GLEAN_REPO_URL=${GLEAN_REPO_URL:-https://git.openstack.org/openstack/glean} GLEAN_REPO_REF=${GLEAN_REPO_REF:-master} +enable_service zookeeper enable_service geard enable_service statsd enable_service nodepool From 94cb8b7efb96b173630707ea2e81486d4b55dc35 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 2 Mar 2017 14:36:37 -0500 Subject: [PATCH 081/309] Correct the quota/availability algorithm Requests that try to request more than the total quota will be flat out rejected. Requests that cannot be satisified because of node availability will cause the launcher for the provider satisifying the request to pause handling new requests until the current request can be satisfied. Change-Id: Id04e5182a65f2485ee96caa8cb74b5e95a4ea6a1 --- nodepool/nodepool.py | 83 ++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 59368a930..d07c4c613 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -850,17 +850,6 @@ class NodeRequestHandler(object): count += 1 return count - def _wouldExceedQuota(self): - ''' - Determines if request would exceed provider quota. - - :returns: True if quota would be exceeded, False otherwise. - ''' - provider_max = self.provider.max_servers - num_requested = len(self.request.node_types) - num_in_use = self._countNodes() - return num_requested + num_in_use > provider_max - def _unlockNodeSet(self): ''' Attempt unlocking all Nodes in the object node set. @@ -875,9 +864,12 @@ class NodeRequestHandler(object): self.log.debug("Unlocked node %s for request %s", node.id, self.request.id) - def _run(self): + def _waitForNodeSet(self): ''' - Main body for the NodeRequestHandler. + Fill node set for the request. + + Obtain nodes for the request, pausing all new request handling for + this provider until the node set can be filled. note:: This code is a bit racey in its calculation of the number of nodes in use for quota purposes. 
It is possible for multiple @@ -888,30 +880,6 @@ class NodeRequestHandler(object): launcher has already started doing so. This would cause an expected failure from the underlying library, which is ok for now. ''' - declined_reasons = [] - if not self._imagesAvailable(): - declined_reasons.append('images are not available') - if self._wouldExceedQuota(): - declined_reasons.append('it would exceed quota') - if declined_reasons: - self.log.debug("Declining node request %s because %s", - self.request.id, ', '.join(declined_reasons)) - self.request.declined_by.append(self.launcher_id) - launchers = set(self.zk.getRegisteredLaunchers()) - if launchers.issubset(set(self.request.declined_by)): - self.log.debug("Failing declined node request %s", - self.request.id) - # All launchers have declined it - self.request.state = zk.FAILED - self.zk.storeNodeRequest(self.request) - self.zk.unlockNodeRequest(self.request) - self.done = True - return - - self.log.debug("Accepting node request %s", self.request.id) - self.request.state = zk.PENDING - self.zk.storeNodeRequest(self.request) - self.launch_manager = NodeLaunchManager( self.zk, self.provider, self.labels, self.manager, retries=self.provider.launch_retries) @@ -939,6 +907,18 @@ class NodeRequestHandler(object): # Could not grab an existing node, so launch a new one. if not got_a_node: + logged = False + + # If we calculate that we're at capacity, pause until nodes + # are released by Zuul and removed by the NodeCleanupWorker. + while self._countNodes() >= self.provider.max_servers: + if not logged: + self.log.debug( + "Pausing request handling to satisfy request %s", + self.request) + logged = True + time.sleep(1) + node = zk.Node() node.state = zk.INIT node.type = ntype @@ -963,6 +943,35 @@ class NodeRequestHandler(object): # launches. self.launch_manager.launch(node) + def _run(self): + ''' + Main body for the NodeRequestHandler. + ''' + declined_reasons = [] + if not self._imagesAvailable(): + declined_reasons.append('images are not available') + if len(self.request.node_types) > self.provider.max_servers: + declined_reasons.append('it would exceed quota') + if declined_reasons: + self.log.debug("Declining node request %s because %s", + self.request.id, ', '.join(declined_reasons)) + self.request.declined_by.append(self.launcher_id) + launchers = set(self.zk.getRegisteredLaunchers()) + if launchers.issubset(set(self.request.declined_by)): + self.log.debug("Failing declined node request %s", + self.request.id) + # All launchers have declined it + self.request.state = zk.FAILED + self.zk.storeNodeRequest(self.request) + self.zk.unlockNodeRequest(self.request) + self.done = True + return + + self.log.debug("Accepting node request %s", self.request.id) + self.request.state = zk.PENDING + self.zk.storeNodeRequest(self.request) + self._waitForNodeSet() + @property def alive_thread_count(self): return self.launch_manager.alive_thread_count From 96c8ba1781f9abe68d3cec262292534828295667 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 3 Mar 2017 13:15:28 -0500 Subject: [PATCH 082/309] Only use ready nodes from the same provider. When filling out a node set for a request, we should not consider READY nodes that are not from the provider being used to satisfy the request. 
Change-Id: I63c79c920133753be0fc2c3061bfe4131fc1c7cb --- nodepool/nodepool.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index d07c4c613..1267c9669 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -890,6 +890,10 @@ class NodeRequestHandler(object): got_a_node = False if self.request.reuse and ntype in ready_nodes: for node in ready_nodes[ntype]: + # Only interested in nodes from this provider + if node.provider != self.provider.name: + continue + try: self.zk.lockNode(node, blocking=False) except exceptions.ZKLockException: From ad92ea629ffad4a653c549c657171090c2721d11 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 3 Mar 2017 15:48:45 -0500 Subject: [PATCH 083/309] Group nodes by availability zone An AZ is chosen at random when selecting a node set for a node request. If the AZs are not listed within the config, then we cannot guarantee AZ grouping. To avoid selecting an AZ that may not match any nodes within our READY pool (because we ALWAYS want to use READY nodes when they're available), we use the AZ from the first READY node for the random AZ choice. If we bypass the READY pool (e.g, min-ready requests), or if the READY pool is depleted, then we randomly select an AZ from the config. Change-Id: I97bdda2c4b6952f1cea18f72927ed1e8ccff9787 --- nodepool/fakeprovider.py | 2 ++ nodepool/nodepool.py | 36 +++++++++++++++++++++++++++---- nodepool/tests/fixtures/node.yaml | 2 ++ nodepool/tests/test_nodepool.py | 1 + 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/nodepool/fakeprovider.py b/nodepool/fakeprovider.py index 2ee40b3a7..5feafe135 100644 --- a/nodepool/fakeprovider.py +++ b/nodepool/fakeprovider.py @@ -31,6 +31,7 @@ class Dummy(object): INSTANCE = 'Instance' FLAVOR = 'Flavor' KEYPAIR = 'Keypair' + LOCATION = 'Server.Location' def __init__(self, kind, **kw): self.__kind = kind @@ -146,6 +147,7 @@ class FakeOpenStackCloud(object): public_v4=public_v4, public_v6=public_v6, private_v4=private_v4, + location=Dummy(Dummy.LOCATION, zone=kw.get('az')), metadata=kw.get('meta', {}), manager=self, key_name=kw.get('key_name', None), diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 1267c9669..2fb116b94 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -561,6 +561,10 @@ class NodeLauncher(threading.Thread): nodepool_node_id=self._node.id, nodepool_image_name=config_image.name) + # If we didn't specify an AZ, set it to the one chosen by Nova. + if not self._node.az: + self._node.az = server.location.zone + self._node.external_id = server.id self._node.hostname = hostname self._node.image_id = "{path}/{upload_id}".format( @@ -609,11 +613,11 @@ class NodeLauncher(threading.Thread): # Checkpoint save the updated node info self._zk.storeNode(self._node) - self.log.debug("Node id: %s is running, ipv4: %s, ipv6: %s" % - (self._node.id, self._node.public_ipv4, + self.log.debug("Node %s is running [az: %s, ipv4: %s, ipv6: %s]" % + (self._node.id, self._node.az, self._node.public_ipv4, self._node.public_ipv6)) - self.log.debug("Node id: %s testing ssh at ip: %s" % + self.log.debug("Node %s testing ssh at ip: %s" % (self._node.id, preferred_ip)) host = utils.ssh_connect( preferred_ip, config_image.username, @@ -871,6 +875,14 @@ class NodeRequestHandler(object): Obtain nodes for the request, pausing all new request handling for this provider until the node set can be filled. + We attempt to group the node set within the same provider availability + zone. 
For this to work properly, the provider entry in the nodepool + config must list the availability zones. Otherwise, new nodes will be + put in random AZs at nova's whim. The exception being if there is an + existing node in the READY state that we can select for this node set. + Its AZ will then be used for new nodes, as well as any other READY + nodes. + note:: This code is a bit racey in its calculation of the number of nodes in use for quota purposes. It is possible for multiple launchers to be doing this calculation at the same time. Since we @@ -885,14 +897,19 @@ class NodeRequestHandler(object): retries=self.provider.launch_retries) ready_nodes = self.zk.getReadyNodesOfTypes(self.request.node_types) + chosen_az = None + for ntype in self.request.node_types: # First try to grab from the list of already available nodes. got_a_node = False if self.request.reuse and ntype in ready_nodes: for node in ready_nodes[ntype]: - # Only interested in nodes from this provider + # Only interested in nodes from this provider and within + # the selected AZ. if node.provider != self.provider.name: continue + if chosen_az and node.az != chosen_az: + continue try: self.zk.lockNode(node, blocking=False) @@ -907,10 +924,20 @@ class NodeRequestHandler(object): node.allocated_to = self.request.id self.zk.storeNode(node) self.nodeset.append(node) + + # AZ from this ready node. This will cause new nodes + # to share this AZ, as well. + if not chosen_az and node.az: + chosen_az = node.az break # Could not grab an existing node, so launch a new one. if not got_a_node: + # Select grouping AZ if we didn't set AZ from a selected, + # pre-existing node + if not chosen_az and self.provider.azs: + chosen_az = random.choice(self.provider.azs) + logged = False # If we calculate that we're at capacity, pause until nodes @@ -927,6 +954,7 @@ class NodeRequestHandler(object): node.state = zk.INIT node.type = ntype node.provider = self.provider.name + node.az = chosen_az node.launcher = self.launcher_id node.allocated_to = self.request.id diff --git a/nodepool/tests/fixtures/node.yaml b/nodepool/tests/fixtures/node.yaml index 7e62d53c7..3406c61e8 100644 --- a/nodepool/tests/fixtures/node.yaml +++ b/nodepool/tests/fixtures/node.yaml @@ -20,6 +20,8 @@ labels: providers: - name: fake-provider region-name: fake-region + availability-zones: + - az1 keypair: 'if-present-use-this-keypair' username: 'fake' password: 'fake' diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 04e36f3aa..1712edf5a 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -57,6 +57,7 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(node.allocated_to, req.id) self.assertEqual(node.state, zk.READY) self.assertIsNotNone(node.launcher) + self.assertEqual(node.az, "az1") p = "{path}/{id}".format( path=self.zk._imageUploadPath(image.image_name, image.build_id, From 4a8803ddb2b0dd0492132a688009bf03b0262a6c Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 6 Mar 2017 11:43:32 -0500 Subject: [PATCH 084/309] Fix comment about AZ selection Accidentally removed a comment line in a previous change. This adds it back (correcting it, slightly). 
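For reference, the selection order that the restored comment documents can be condensed into a small, hypothetical helper (names follow the preceding diffs; this is not the handler code itself):

    import random

    def choose_az(ready_node_az, configured_azs):
        """Prefer the AZ of an already-selected READY node, then a random
        configured AZ, otherwise return None and let nova pick (the node
        later records server.location.zone)."""
        if ready_node_az:
            return ready_node_az
        if configured_azs:
            return random.choice(configured_azs)
        return None
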
Change-Id: Ia09694d482748d9d0ca7e4037307a7f0fe24470c --- nodepool/nodepool.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 2fb116b94..de8556f25 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -925,6 +925,7 @@ class NodeRequestHandler(object): self.zk.storeNode(node) self.nodeset.append(node) + # If we haven't already chosen an AZ, select the # AZ from this ready node. This will cause new nodes # to share this AZ, as well. if not chosen_az and node.az: From 200e5b1e0959bb8f206d7fe8da9036b7be83e78b Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 6 Mar 2017 12:03:21 -0500 Subject: [PATCH 085/309] Suspend NodeCleanupWorker on ZK suspension Similar to other threads, we need to wait for ZK connections to return. Change-Id: Id3cb828290bb80422d2a665dcfbc920982e3c662 --- nodepool/nodepool.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index de8556f25..3aebccaed 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -1298,6 +1298,12 @@ class NodeCleanupWorker(threading.Thread): self._running = True while self._running: + # Don't do work if we've lost communication with the ZK cluster + zk_conn = self._nodepool.getZK() + while zk_conn and (zk_conn.suspended or zk_conn.lost): + self.log.info("ZooKeeper suspended. Waiting") + time.sleep(SUSPEND_WAIT_TIME) + try: self._cleanupNodeRequestLocks() self._cleanupNodes() From cdc28ca3699c9048d743bcd15a018e97b83ad28f Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Wed, 8 Mar 2017 09:30:36 -0500 Subject: [PATCH 086/309] Add 'requestor' to NodeRequest model Zuul adds this attribute to the ZK data. Nodepool will need it when reporting launch statistics. Change-Id: Ibfec696a20b8a2cf610ed3599afbe670d8986ccc --- nodepool/tests/test_zk.py | 4 ++++ nodepool/zk.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 0e254efa6..6168057c1 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -723,6 +723,7 @@ class TestZKModel(tests.BaseTestCase): o.node_types.append('trusty') o.nodes.append('100') o.reuse = False + o.requestor = 'zuul' d = o.toDict() self.assertNotIn('id', d) self.assertIn('state', d) @@ -731,6 +732,7 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(d['node_types'], o.node_types) self.assertEqual(d['nodes'], o.nodes) self.assertEqual(d['reuse'], o.reuse) + self.assertEqual(d['requestor'], o.requestor) def test_NodeRequest_fromDict(self): now = int(time.time()) @@ -742,6 +744,7 @@ class TestZKModel(tests.BaseTestCase): 'node_types': ['trusty'], 'nodes': ['100'], 'reuse': False, + 'requestor': 'zuul', } o = zk.NodeRequest.fromDict(d, req_id) @@ -752,6 +755,7 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(o.node_types, d['node_types']) self.assertEqual(o.nodes, d['nodes']) self.assertEqual(o.reuse, d['reuse']) + self.assertEqual(o.requestor, d['requestor']) def test_Node_toDict(self): o = zk.Node('123') diff --git a/nodepool/zk.py b/nodepool/zk.py index cad3a71cb..98e07f913 100644 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -342,6 +342,7 @@ class NodeRequest(BaseModel): self.node_types = [] self.nodes = [] self.reuse = True + self.requestor = None def __repr__(self): d = self.toDict() @@ -355,7 +356,8 @@ class NodeRequest(BaseModel): self.declined_by == other.declined_by and self.node_types == other.node_types and self.nodes == other.nodes, - self.reuse == other.reuse) + self.reuse == 
other.reuse, + self.requestor == other.requestor) else: return False @@ -368,6 +370,7 @@ class NodeRequest(BaseModel): d['node_types'] = self.node_types d['nodes'] = self.nodes d['reuse'] = self.reuse + d['requestor'] = self.requestor return d @staticmethod @@ -386,6 +389,7 @@ class NodeRequest(BaseModel): o.node_types = d.get('node_types', []) o.nodes = d.get('nodes', []) o.reuse = d.get('reuse', True) + o.requestor = d.get('requestor') return o From f616f88111bfb294660e3f9ec6296a2f8ea9b6a4 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 9 Mar 2017 07:33:17 -0500 Subject: [PATCH 087/309] Add back statsd reporting This may require tweaking. Change-Id: I5481d5855045fa53fa468fca7b189efc76001d38 --- nodepool/nodepool.py | 249 ++++++++++++----------- nodepool/tests/test_nodelaunchmanager.py | 6 +- 2 files changed, 135 insertions(+), 120 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 3aebccaed..a335b8648 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -170,12 +170,103 @@ class NodeCompleteThread(threading.Thread): self.nodepool.deleteNode(node.id) -class InstanceDeleter(threading.Thread): +class StatsReporter(object): + ''' + Class adding statsd reporting functionality. + ''' + def __init__(self): + super(StatsReporter, self).__init__() + self._statsd = stats.get_client() + + def recordLaunchStats(self, subkey, dt, image_name, + provider_name, node_az, requestor): + ''' + Record node launch statistics. + + :param str subkey: statsd key + :param int dt: Time delta in milliseconds + :param str image_name: Name of the image used + :param str provider_name: Name of the provider + :param str node_az: AZ of the launched node + :param str requestor: Identifier for the request originator + ''' + if not self._statsd: + return + + keys = [ + 'nodepool.launch.provider.%s.%s' % (provider_name, subkey), + 'nodepool.launch.image.%s.%s' % (image_name, subkey), + 'nodepool.launch.%s' % (subkey,), + ] + + if node_az: + keys.append('nodepool.launch.provider.%s.%s.%s' % + (provider_name, node_az, subkey)) + + if requestor: + keys.append('nodepool.launch.requestor.%s.%s' % + (requestor, subkey)) + + for key in keys: + self._statsd.timing(key, dt) + self._statsd.incr(key) + + + def updateNodeStats(self, zk_conn, provider): + ''' + Refresh statistics for all known nodes. + + :param ZooKeeper zk_conn: A ZooKeeper connection object. + :param Provider provider: A config Provider object. 
+ ''' + if not self._statsd: + return + + states = {} + + # Initialize things we know about to zero + for state in zk.Node.VALID_STATES: + key = 'nodepool.nodes.%s' % state + states[key] = 0 + key = 'nodepool.provider.%s.nodes.%s' % (provider.name, state) + states[key] = 0 + + for node in zk_conn.nodeIterator(): + #nodepool.nodes.STATE + key = 'nodepool.nodes.%s' % node.state + states[key] += 1 + + #nodepool.label.LABEL.nodes.STATE + key = 'nodepool.label.%s.nodes.%s' % (node.type, node.state) + # It's possible we could see node types that aren't in our config + if key in states: + states[key] += 1 + else: + states[key] = 1 + + #nodepool.provider.PROVIDER.nodes.STATE + key = 'nodepool.provider.%s.nodes.%s' % (node.provider, node.state) + # It's possible we could see providers that aren't in our config + if key in states: + states[key] += 1 + else: + states[key] = 1 + + for key, count in states.items(): + self._statsd.gauge(key, count) + + #nodepool.provider.PROVIDER.max_servers + key = 'nodepool.provider.%s.max_servers' % provider.name + self._statsd.gauge(key, provider.max_servers) + + +class InstanceDeleter(threading.Thread, StatsReporter): log = logging.getLogger("nodepool.InstanceDeleter") def __init__(self, zk, manager, node): threading.Thread.__init__(self, name='InstanceDeleter for %s %s' % (node.provider, node.external_id)) + StatsReporter.__init__(self) self._zk = zk self._manager = manager self._node = node @@ -210,6 +301,11 @@ class InstanceDeleter(threading.Thread): def run(self): self.delete(self._zk, self._manager, self._node) + try: + self.updateNodeStats(self._zk, self._manager.provider) + except Exception: + self.log.exception("Exception while reporting stats:") + class NodeDeleter(threading.Thread): log = logging.getLogger("nodepool.NodeDeleter") @@ -509,9 +605,10 @@ class OLDNodeLauncher(threading.Thread): output=True) -class NodeLauncher(threading.Thread): +class NodeLauncher(threading.Thread, StatsReporter): - def __init__(self, zk, provider, label, provider_manager, node, retries): + def __init__(self, zk, provider, label, provider_manager, requestor, + node, retries): ''' Initialize the launcher. @@ -520,10 +617,12 @@ class NodeLauncher(threading.Thread): :param Label label: The Label object for this node type. :param ProviderManager provider_manager: The manager object used to interact with the selected provider. + :param str requestor: Identifier for the request originator. :param Node node: The node object. :param int retries: Number of times to retry failed launches. 
''' threading.Thread.__init__(self, name="NodeLauncher-%s" % node.id) + StatsReporter.__init__(self) self.log = logging.getLogger("nodepool.NodeLauncher-%s" % node.id) self._zk = zk self._provider = provider @@ -531,10 +630,15 @@ class NodeLauncher(threading.Thread): self._manager = provider_manager self._node = node self._retries = retries + self._image_name = None + self._requestor = requestor def _launchNode(self): config_image = self._provider.images[self._label.image] + # Stored for statsd reporting + self._image_name = config_image.name + cloud_image = self._zk.getMostRecentImageUpload( config_image.name, self._provider.name) if not cloud_image: @@ -703,18 +807,36 @@ class NodeLauncher(threading.Thread): self.log.info("Node id %s is ready", self._node.id) def run(self): + start_time = time.time() + statsd_key = 'ready' + try: self._run() - except Exception: + except Exception as e: self._node.state = zk.FAILED self._zk.storeNode(self._node) + if hasattr(e, 'statsd_key'): + statsd_key = e.statsd_key + else: + statsd_key = 'error.unknown' + + dt = int((time.time() - start_time) * 1000) + try: + self.recordLaunchStats(statsd_key, dt, self._image_name, + self._node.provider, self._node.az, + self._requestor) + self.updateNodeStats(self._zk, self._provider) + except Exception: + self.log.exception("Exception while reporting stats:") + class NodeLaunchManager(object): ''' Handle launching multiple nodes in parallel. ''' - def __init__(self, zk, provider, labels, provider_manager, retries): + def __init__(self, zk, provider, labels, provider_manager, + requestor, retries): ''' Initialize the launch manager. @@ -723,6 +845,7 @@ class NodeLaunchManager(object): :param dict labels: A dict of config Label objects. :param ProviderManager provider_manager: The manager object used to interact with the selected provider. + :param str requestor: Identifier for the request originator. :param int retries: Number of times to retry failed launches. ''' self._retries = retries @@ -734,6 +857,7 @@ class NodeLaunchManager(object): self._provider = provider self._labels = labels self._manager = provider_manager + self._requestor = requestor @property def alive_thread_count(self): @@ -764,7 +888,7 @@ class NodeLaunchManager(object): self._nodes.append(node) label = self._labels[node.type] t = NodeLauncher(self._zk, self._provider, label, self._manager, - node, self._retries) + self._requestor, node, self._retries) t.start() self._threads.append(t) @@ -894,9 +1018,9 @@ class NodeRequestHandler(object): ''' self.launch_manager = NodeLaunchManager( self.zk, self.provider, self.labels, self.manager, - retries=self.provider.launch_retries) - ready_nodes = self.zk.getReadyNodesOfTypes(self.request.node_types) + self.request.requestor, retries=self.provider.launch_retries) + ready_nodes = self.zk.getReadyNodesOfTypes(self.request.node_types) chosen_az = None for ntype in self.request.node_types: @@ -1922,112 +2046,3 @@ class NodePool(threading.Thread): node.label_name) self.deleteNode(node.id) self.log.debug("Finished periodic check") - - def updateStats(self, session, provider_name): - if not self.statsd: - return - # This may be called outside of the main thread. 
- - states = {} - - #nodepool.nodes.STATE - #nodepool.target.TARGET.nodes.STATE - #nodepool.label.LABEL.nodes.STATE - #nodepool.provider.PROVIDER.nodes.STATE - for state in nodedb.STATE_NAMES.values(): - key = 'nodepool.nodes.%s' % state - states[key] = 0 - for target in self.config.targets.values(): - key = 'nodepool.target.%s.nodes.%s' % ( - target.name, state) - states[key] = 0 - for label in self.config.labels.values(): - key = 'nodepool.label.%s.nodes.%s' % ( - label.name, state) - states[key] = 0 - for provider in self.config.providers.values(): - key = 'nodepool.provider.%s.nodes.%s' % ( - provider.name, state) - states[key] = 0 - - managers = set() - - for node in session.getNodes(): - if node.state not in nodedb.STATE_NAMES: - continue - state = nodedb.STATE_NAMES[node.state] - key = 'nodepool.nodes.%s' % state - total_nodes = 1 - states[key] += total_nodes - - # NOTE(pabelanger): Check if we assign nodes via Gearman if so, use - # the manager name. - #nodepool.manager.MANAGER.nodes.STATE - if node.manager_name: - key = 'nodepool.manager.%s.nodes.%s' % ( - node.manager_name, state) - if key not in states: - states[key] = 0 - managers.add(node.manager_name) - else: - key = 'nodepool.target.%s.nodes.%s' % ( - node.target_name, state) - states[key] += total_nodes - - key = 'nodepool.label.%s.nodes.%s' % ( - node.label_name, state) - states[key] += total_nodes - - key = 'nodepool.provider.%s.nodes.%s' % ( - node.provider_name, state) - states[key] += total_nodes - - # NOTE(pabelanger): Initialize other state values to zero if missed - # above. - #nodepool.manager.MANAGER.nodes.STATE - for state in nodedb.STATE_NAMES.values(): - for manager_name in managers: - key = 'nodepool.manager.%s.nodes.%s' % ( - manager_name, state) - if key not in states: - states[key] = 0 - - for key, count in states.items(): - self.statsd.gauge(key, count) - - #nodepool.provider.PROVIDER.max_servers - for provider in self.config.providers.values(): - key = 'nodepool.provider.%s.max_servers' % provider.name - self.statsd.gauge(key, provider.max_servers) - - def launchStats(self, subkey, dt, image_name, - provider_name, target_name, node_az, manager_name): - if not self.statsd: - return - #nodepool.launch.provider.PROVIDER.subkey - #nodepool.launch.image.IMAGE.subkey - #nodepool.launch.subkey - keys = [ - 'nodepool.launch.provider.%s.%s' % (provider_name, subkey), - 'nodepool.launch.image.%s.%s' % (image_name, subkey), - 'nodepool.launch.%s' % (subkey,), - ] - if node_az: - #nodepool.launch.provider.PROVIDER.AZ.subkey - keys.append('nodepool.launch.provider.%s.%s.%s' % - (provider_name, node_az, subkey)) - - if manager_name: - # NOTE(pabelanger): Check if we assign nodes via Gearman if so, use - # the manager name. 
- #nodepool.launch.manager.MANAGER.subkey - keys.append('nodepool.launch.manager.%s.%s' % - (manager_name, subkey)) - else: - #nodepool.launch.target.TARGET.subkey - keys.append('nodepool.launch.target.%s.%s' % - (target_name, subkey)) - - for key in keys: - self.statsd.timing(key, dt) - self.statsd.incr(key) diff --git a/nodepool/tests/test_nodelaunchmanager.py b/nodepool/tests/test_nodelaunchmanager.py index 3236eed46..d93715899 100644 --- a/nodepool/tests/test_nodelaunchmanager.py +++ b/nodepool/tests/test_nodelaunchmanager.py @@ -54,7 +54,7 @@ class TestNodeLaunchManager(tests.DBTestCase): n1.state = zk.BUILDING n1.type = 'fake-label' mgr = NodeLaunchManager(self.zk, self.provider, self.labels, - self.pmanager, 1) + self.pmanager, 'zuul', 1) mgr.launch(n1) while not mgr.poll(): time.sleep(0) @@ -71,7 +71,7 @@ class TestNodeLaunchManager(tests.DBTestCase): n1.state = zk.BUILDING n1.type = 'fake-label' mgr = NodeLaunchManager(self.zk, self.provider, self.labels, - self.pmanager, 1) + self.pmanager, 'zuul', 1) mgr.launch(n1) while not mgr.poll(): time.sleep(0) @@ -91,7 +91,7 @@ class TestNodeLaunchManager(tests.DBTestCase): n2.state = zk.BUILDING n2.type = 'fake-label' mgr = NodeLaunchManager(self.zk, self.provider, self.labels, - self.pmanager, 1) + self.pmanager, 'zuul', 1) mgr.launch(n1) mgr.launch(n2) while not mgr.poll(): From 904d1248c0fb91b8e395a538c1d4b0454f1c37c9 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 9 Mar 2017 08:59:44 -0500 Subject: [PATCH 088/309] Remove old/dead classes Removes three classes that are either dead code or replaced by newer classes. Change-Id: I6cf96c81af1d57b11e42ffb5083c44697b82c71e --- nodepool/nodepool.py | 395 ------------------------------------------- 1 file changed, 395 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index a335b8648..23840a0c6 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -73,103 +73,6 @@ class LaunchAuthException(Exception): statsd_key = 'error.auth' -class NodeCompleteThread(threading.Thread): - log = logging.getLogger("nodepool.NodeCompleteThread") - - def __init__(self, nodepool, nodename, jobname, result, branch): - threading.Thread.__init__(self, name='NodeCompleteThread for %s' % - nodename) - self.nodename = nodename - self.nodepool = nodepool - self.jobname = jobname - self.result = result - self.branch = branch - self.statsd = stats.get_client() - - def run(self): - try: - with self.nodepool.getDB().getSession() as session: - self.handleEvent(session) - except Exception: - self.log.exception("Exception handling event for %s:" % - self.nodename) - - def handleEvent(self, session): - node = session.getNodeByNodename(self.nodename) - if not node: - self.log.debug("Unable to find node with nodename: %s" % - self.nodename) - return - - if node.state == nodedb.HOLD: - self.log.info("Node id: %s is complete but in HOLD state" % - node.id) - return - - nodepool_job = session.getJobByName(self.jobname) - if (nodepool_job and nodepool_job.hold_on_failure and - self.result != 'SUCCESS'): - held_nodes = session.getNodes(state=nodedb.HOLD) - held_nodes = [n for n in held_nodes if self.jobname in n.comment] - if len(held_nodes) >= nodepool_job.hold_on_failure: - self.log.info("Node id: %s has failed %s but %s nodes " - "are already held for that job" % ( - node.id, self.jobname, len(held_nodes))) - else: - node.state = nodedb.HOLD - node.comment = "Automatically held after failing %s" % ( - self.jobname,) - self.log.info("Node id: %s failed %s, automatically holding" % ( - node.id, 
self.jobname)) - self.nodepool.updateStats(session, node.provider_name) - return - - target = self.nodepool.config.targets[node.target_name] - if self.jobname == target.jenkins_test_job: - self.log.debug("Test job for node id: %s complete, result: %s" % - (node.id, self.result)) - if self.result == 'SUCCESS': - jenkins = self.nodepool.getJenkinsManager(target) - old = jenkins.relabelNode(node.nodename, [node.image_name]) - if not old: - old = '[unlabeled]' - self.log.info("Relabeled jenkins node id: %s from %s to %s" % - (node.id, old, node.image_name)) - node.state = nodedb.READY - self.log.info("Node id: %s is ready" % node.id) - self.nodepool.updateStats(session, node.provider_name) - return - self.log.info("Node id: %s failed acceptance test, deleting" % - node.id) - - if self.statsd and self.result == 'SUCCESS': - start = node.state_time - dt = int((time.time() - start) * 1000) - - # nodepool.job.tempest - key = 'nodepool.job.%s' % self.jobname - self.statsd.timing(key + '.runtime', dt) - self.statsd.incr(key + '.builds') - - # nodepool.job.tempest.master - key += '.%s' % self.branch - self.statsd.timing(key + '.runtime', dt) - self.statsd.incr(key + '.builds') - - # nodepool.job.tempest.master.devstack-precise - key += '.%s' % node.label_name - self.statsd.timing(key + '.runtime', dt) - self.statsd.incr(key + '.builds') - - # nodepool.job.tempest.master.devstack-precise.rax-ord - key += '.%s' % node.provider_name - self.statsd.timing(key + '.runtime', dt) - self.statsd.incr(key + '.builds') - - time.sleep(DELETE_DELAY) - self.nodepool.deleteNode(node.id) - - class StatsReporter(object): ''' Class adding statsd reporting functionality. @@ -307,304 +210,6 @@ class InstanceDeleter(threading.Thread, StatsReporter): self.log.exception("Exception while reporting stats:") -class NodeDeleter(threading.Thread): - log = logging.getLogger("nodepool.NodeDeleter") - - def __init__(self, nodepool, node_id): - threading.Thread.__init__(self, name='NodeDeleter for %s' % node_id) - self.node_id = node_id - self.nodepool = nodepool - - def run(self): - try: - with self.nodepool.getDB().getSession() as session: - node = session.getNode(self.node_id) - self.nodepool._deleteNode(session, node) - except Exception: - self.log.exception("Exception deleting node %s:" % - self.node_id) - - -class OLDNodeLauncher(threading.Thread): - log = logging.getLogger("nodepool.NodeLauncher") - - def __init__(self, nodepool, provider, label, target, node_id, timeout, - launch_timeout): - threading.Thread.__init__(self, name='NodeLauncher for %s' % node_id) - self.provider = provider - self.label = label - self.image = provider.images[label.image] - self.target = target - self.node_id = node_id - self.timeout = timeout - self.nodepool = nodepool - self.launch_timeout = launch_timeout - - def run(self): - try: - self._run() - except Exception: - self.log.exception("Exception in run method:") - - def _run(self): - with self.nodepool.getDB().getSession() as session: - self.log.debug("Launching node id: %s" % self.node_id) - try: - self.node = session.getNode(self.node_id) - self.manager = self.nodepool.getProviderManager(self.provider) - except Exception: - self.log.exception("Exception preparing to launch node id: %s:" - % self.node_id) - return - - try: - start_time = time.time() - dt = self.launchNode(session) - failed = False - statsd_key = 'ready' - self.log.debug('Node %s ready in provider: %s' % - (self.node_id, self.provider.name)) - except exceptions.TimeoutException as e: - # Don't log exception for timeouts. 
Each one has - # a specific Exception, and we know it's a timeout, so - # the traceback in the log is just noise - self.log.error("Timeout launching node id: %s " - "in provider: %s error: %s" % - (self.node_id, self.provider.name, - str(e))) - dt = int((time.time() - start_time) * 1000) - failed = True - statsd_key = e.statsd_key - except Exception as e: - self.log.exception("%s launching node id: %s " - "in provider: %s error:" % - (e.__class__.__name__, - self.node_id, self.provider.name)) - dt = int((time.time() - start_time) * 1000) - failed = True - if hasattr(e, 'statsd_key'): - statsd_key = e.statsd_key - else: - statsd_key = 'error.unknown' - - try: - - self.nodepool.launchStats(statsd_key, dt, self.image.name, - self.provider.name, - self.target.name, - self.node.az, - self.node.manager_name) - except Exception: - self.log.exception("Exception reporting launch stats:") - - if failed: - try: - self.nodepool.deleteNode(self.node_id) - except Exception: - self.log.exception("Exception deleting node id: %s:" % - self.node_id) - - def launchNode(self, session): - start_time = time.time() - timestamp = int(start_time) - - hostname = self.target.hostname.format( - label=self.label, provider=self.provider, node_id=self.node.id, - timestamp=str(timestamp)) - self.node.hostname = hostname - self.node.nodename = hostname.split('.')[0] - self.node.target_name = self.target.name - - cloud_image = self.nodepool.zk.getMostRecentImageUpload( - self.image.name, self.provider.name) - if not cloud_image: - raise LaunchNodepoolException("Unable to find current cloud" - "image %s in %s" % - (self.image.name, - self.provider.name)) - - self.log.info("Creating server with hostname %s in %s from image %s " - "for node id: %s" % (hostname, self.provider.name, - self.image.name, self.node_id)) - server = self.manager.createServer( - hostname, self.image.min_ram, cloud_image.external_id, - name_filter=self.image.name_filter, az=self.node.az, - config_drive=self.image.config_drive, - nodepool_node_id=self.node_id, - nodepool_image_name=self.image.name) - server_id = server['id'] - self.node.external_id = server_id - session.commit() - - self.log.debug("Waiting for server %s for node id: %s" % - (server_id, self.node.id)) - server = self.manager.waitForServer(server, self.launch_timeout) - if server['status'] != 'ACTIVE': - raise LaunchStatusException("Server %s for node id: %s " - "status: %s" % - (server_id, self.node.id, - server['status'])) - - ip = server.get('public_v4') - ip_v6 = server.get('public_v6') - if self.provider.ipv6_preferred: - if ip_v6: - ip = ip_v6 - else: - self.log.warning('Preferred ipv6 not available, ' - 'falling back to ipv4.') - if not ip: - self.log.debug( - "Server data for failed IP: %s" % pprint.pformat( - server)) - raise LaunchNetworkException("Unable to find public IP of server") - - self.node.ip_private = server.get('private_v4') - # devstack-gate multi-node depends on private_v4 being populated - # with something. On clouds that don't have a private address, use - # the public. 
- if not self.node.ip_private: - self.node.ip_private = server.get('public_v4') - self.node.ip = ip - self.log.debug("Node id: %s is running, ipv4: %s, ipv6: %s" % - (self.node.id, server.get('public_v4'), - server.get('public_v6'))) - - self.log.debug("Node id: %s testing ssh at ip: %s" % - (self.node.id, ip)) - connect_kwargs = dict(key_filename=self.image.private_key) - if not utils.ssh_connect(ip, self.image.username, - connect_kwargs=connect_kwargs, - timeout=self.timeout): - raise LaunchAuthException("Unable to connect via ssh") - - # Save the elapsed time for statsd - dt = int((time.time() - start_time) * 1000) - - nodelist = [] - nodelist.append(('primary', self.node)) - - self.writeNodepoolInfo(nodelist) - if self.label.ready_script: - self.runReadyScript(nodelist) - - # Do this before adding to jenkins to avoid a race where - # Jenkins might immediately use the node before we've updated - # the state: - if self.target.jenkins_test_job: - self.node.state = nodedb.TEST - self.log.info("Node id: %s is in testing" % self.node.id) - else: - self.node.state = nodedb.READY - self.log.info("Node id: %s is ready" % self.node.id) - self.nodepool.updateStats(session, self.provider.name) - - if self.target.jenkins_url: - self.log.debug("Adding node id: %s to jenkins" % self.node.id) - self.createJenkinsNode() - self.log.info("Node id: %s added to jenkins" % self.node.id) - - return dt - - def createJenkinsNode(self): - jenkins = self.nodepool.getJenkinsManager(self.target) - - args = dict(name=self.node.nodename, - host=self.node.ip, - description='Dynamic single use %s node' % self.label.name, - executors=1, - root=self.image.user_home) - if not self.target.jenkins_test_job: - args['labels'] = self.label.name - if self.target.jenkins_credentials_id: - args['credentials_id'] = self.target.jenkins_credentials_id - else: - args['username'] = self.image.username - args['private_key'] = self.image.private_key - - jenkins.createNode(**args) - - if self.target.jenkins_test_job: - params = dict(NODE=self.node.nodename) - jenkins.startBuild(self.target.jenkins_test_job, params) - - def writeNodepoolInfo(self, nodelist): - key = paramiko.RSAKey.generate(2048) - public_key = key.get_name() + ' ' + key.get_base64() - - for role, n in nodelist: - connect_kwargs = dict(key_filename=self.image.private_key) - host = utils.ssh_connect(n.ip, self.image.username, - connect_kwargs=connect_kwargs, - timeout=self.timeout) - if not host: - raise Exception("Unable to log in via SSH") - - host.ssh("test for config dir", "ls /etc/nodepool") - - ftp = host.client.open_sftp() - - # The Role of this node - f = ftp.open('/etc/nodepool/role', 'w') - f.write(role + '\n') - f.close() - # The IP of this node - f = ftp.open('/etc/nodepool/node', 'w') - f.write(n.ip + '\n') - f.close() - # The private IP of this node - f = ftp.open('/etc/nodepool/node_private', 'w') - f.write(n.ip_private + '\n') - f.close() - # The IP of the primary node of this node set - f = ftp.open('/etc/nodepool/primary_node', 'w') - f.write(self.node.ip + '\n') - f.close() - # The private IP of the primary node of this node set - f = ftp.open('/etc/nodepool/primary_node_private', 'w') - f.write(self.node.ip_private + '\n') - f.close() - # The SSH key for this node set - f = ftp.open('/etc/nodepool/id_rsa', 'w') - key.write_private_key(f) - f.close() - f = ftp.open('/etc/nodepool/id_rsa.pub', 'w') - f.write(public_key + '\n') - f.close() - # Provider information for this node set - f = ftp.open('/etc/nodepool/provider', 'w') - 
f.write('NODEPOOL_PROVIDER=%s\n' % self.provider.name) - f.write('NODEPOOL_CLOUD=%s\n' % self.provider.cloud_config.name) - f.write('NODEPOOL_REGION=%s\n' % ( - self.provider.region_name or '',)) - f.write('NODEPOOL_AZ=%s\n' % (self.node.az or '',)) - f.close() - # The instance UUID for this node - f = ftp.open('/etc/nodepool/uuid', 'w') - f.write(n.external_id + '\n') - f.close() - - ftp.close() - - def runReadyScript(self, nodelist): - for role, n in nodelist: - connect_kwargs = dict(key_filename=self.image.private_key) - host = utils.ssh_connect(n.ip, self.image.username, - connect_kwargs=connect_kwargs, - timeout=self.timeout) - if not host: - raise Exception("Unable to log in via SSH") - - env_vars = '' - for k, v in os.environ.items(): - if k.startswith('NODEPOOL_'): - env_vars += ' %s="%s"' % (k, v) - host.ssh("run ready script", - "cd /opt/nodepool-scripts && %s ./%s %s" % - (env_vars, self.label.ready_script, n.hostname), - output=True) - - class NodeLauncher(threading.Thread, StatsReporter): def __init__(self, zk, provider, label, provider_manager, requestor, From 9ff5fc9c5082e1157c398fe802854c4487b8138b Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 9 Mar 2017 10:24:30 -0500 Subject: [PATCH 089/309] Add leaked instance cleanup Servers not listed in the ZooKeeper data are deleted. Change-Id: Ic4181d1c73bbfa5520e52be476ea74a436d1d8cf --- nodepool/nodepool.py | 124 +++++++++++++++++++------------- nodepool/tests/__init__.py | 7 ++ nodepool/tests/test_nodepool.py | 44 ++++-------- 3 files changed, 96 insertions(+), 79 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 23840a0c6..677d1e9c8 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -175,11 +175,19 @@ class InstanceDeleter(threading.Thread, StatsReporter): self._node = node @staticmethod - def delete(zk, manager, node): + def delete(zk, manager, node, node_exists=True): ''' - Delete a node. + Delete a server instance and ZooKeeper node. This is a class method so we can support instantaneous deletes. + + :param ProviderManager manager: ProviderManager object to use for + deleting the server. + :param Node node: A locked Node object that describes the server to + delete. + :param bool node_exists: True if the node actually exists in ZooKeeper. + An artifical Node object can be passed that can be used to delete + a leaked instance. ''' try: manager.cleanupServer(node.external_id) @@ -192,17 +200,26 @@ class InstanceDeleter(threading.Thread, StatsReporter): "Exception deleting instance %s from %s:", node.external_id, node.provider) # Don't delete the ZK node in this case, but do unlock it - zk.unlockNode(node) + if node_exists: + zk.unlockNode(node) return - InstanceDeleter.log.info( - "Deleting ZK node id=%s, state=%s, external_id=%s", - node.id, node.state, node.external_id) - zk.unlockNode(node) - zk.deleteNode(node) + if node_exists: + InstanceDeleter.log.info( + "Deleting ZK node id=%s, state=%s, external_id=%s", + node.id, node.state, node.external_id) + zk.unlockNode(node) + zk.deleteNode(node) def run(self): - self.delete(self._zk, self._manager, self._node) + # Since leaked instances won't have an actual node in ZooKeeper, + # we need to check 'id' to see if this is an artificial Node. 
+ if self._node.id is None: + node_exists = False + else: + node_exists = True + + self.delete(self._zk, self._manager, self._node, node_exists) try: self.updateNodeStats(self._zk, self._manager.provider) @@ -1022,6 +1039,54 @@ class NodeCleanupWorker(threading.Thread): # node from ZooKeeper if it succeeds. self._deleteInstance(node) + def _cleanupLeakedInstances(self): + ''' + Delete any leaked server instances. + + Remove any servers we find in providers we know about that are not + recorded in the ZooKeeper data. + ''' + zk_conn = self._nodepool.getZK() + + for provider in self._nodepool.config.providers.values(): + manager = self._nodepool.getProviderManager(provider.name) + + # NOTE: Cache the servers BEFORE caching the nodes. Doing this in + # the reverse order would create a race where a new server could + # be created just after we cache the list of nodes, thus making it + # incorrectly appear as leaked since we might not have cached the + # node for it. + servers = manager.listServers() + known = set([n.external_id for n in zk_conn.nodeIterator() if n.provider == provider.name]) + + for server in servers: + meta = server.get('metadata', {}).get('nodepool') + if not meta: + self.log.debug( + "Instance %s (%s) in %s has no nodepool metadata", + server.name, server.id, provider.name) + continue + + meta = json.loads(meta) + if meta['provider_name'] != provider.name: + # Another launcher, sharing this provider but configured + # with a different name, owns this. + continue + + if server.id not in known: + self.log.warning( + "Deleting leaked instance %s (%s) in %s", + server.name, server.id, provider.name + ) + # Create an artifical node to use for deleting the server. + node = zk.Node() + node.external_id = server.id + node.provider = provider.name + self._deleteInstance(node) + + if provider.clean_floating_ips: + manager.cleanupLeakedFloaters() + def run(self): self.log.info("Starting") self._running = True @@ -1036,6 +1101,7 @@ class NodeCleanupWorker(threading.Thread): try: self._cleanupNodeRequestLocks() self._cleanupNodes() + self._cleanupLeakedInstances() except Exception: self.log.exception("Exception in NodeCleanupWorker:") @@ -1586,46 +1652,6 @@ class NodePool(threading.Thread): self.statsd.incr(key) self.updateStats(session, node.provider_name) - def cleanupLeakedInstances(self): - known_providers = self.config.providers.keys() - for provider in self.config.providers.values(): - manager = self.getProviderManager(provider) - servers = manager.listServers() - with self.getDB().getSession() as session: - for server in servers: - meta = server.get('metadata', {}).get('nodepool') - if not meta: - self.log.debug("Instance %s (%s) in %s has no " - "nodepool metadata" % ( - server['name'], server['id'], - provider.name)) - continue - meta = json.loads(meta) - if meta['provider_name'] not in known_providers: - self.log.debug("Instance %s (%s) in %s " - "lists unknown provider %s" % ( - server['name'], server['id'], - provider.name, - meta['provider_name'])) - continue - node_id = meta.get('node_id') - if node_id: - if session.getNode(node_id): - continue - self.log.warning("Deleting leaked instance %s (%s) " - "in %s for node id: %s" % ( - server['name'], server['id'], - provider.name, node_id)) - self.deleteInstance(provider.name, server['id']) - else: - self.log.warning("Instance %s (%s) in %s has no " - "database id" % ( - server['name'], server['id'], - provider.name)) - continue - if provider.clean_floating_ips: - manager.cleanupLeakedFloaters() - def periodicCheck(self, 
session): # This function should be run periodically to make sure we can # still access hosts via ssh. diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index a24657578..828efd276 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -433,6 +433,13 @@ class DBTestCase(BaseTestCase): break time.sleep(1) + def waitForInstanceDeletion(self, manager, instance_id): + while True: + servers = manager.listServers() + if not (instance_id in [s.id for s in servers]): + break + time.sleep(1) + def waitForNodeRequestLockDeletion(self, request_id): while True: exists = False diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 1712edf5a..2789243c4 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -344,7 +344,6 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(len(deleted_nodes), 1) self.assertEqual(node_id, deleted_nodes[0].id) - @skip("Disabled for early v3 development") def test_leaked_node(self): """Test that a leaked node is deleted""" configfile = self.setup_config('leaked_node.yaml') @@ -353,48 +352,33 @@ class TestNodepool(tests.DBTestCase): pool.start() self.waitForImage('fake-provider', 'fake-image') self.log.debug("Waiting for initial pool...") - self.waitForNodes(pool) + nodes = self.waitForNodes('fake-label') self.log.debug("...done waiting for initial pool.") # Make sure we have a node built and ready - provider = pool.config.providers['fake-provider'] - manager = pool.getProviderManager(provider) + self.assertEqual(len(nodes), 1) + manager = pool.getProviderManager('fake-provider') servers = manager.listServers() self.assertEqual(len(servers), 1) - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) - # Delete the node from the db, but leave the instance - # so it is leaked. - self.log.debug("Delete node db record so instance is leaked...") - for node in nodes: - node.delete() - self.log.debug("...deleted node db so instance is leaked.") - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 0) + # Delete the node from ZooKeeper, but leave the instance + # so it is leaked. 
+ self.log.debug("Delete node db record so instance is leaked...") + self.zk.deleteNode(nodes[0]) + self.log.debug("...deleted node db so instance is leaked.") - # Wait for nodepool to replace it, which should be enough - # time for it to also delete the leaked node + # Wait for nodepool to replace it self.log.debug("Waiting for replacement pool...") - self.waitForNodes(pool) + new_nodes = self.waitForNodes('fake-label') self.log.debug("...done waiting for replacement pool.") + self.assertEqual(len(new_nodes), 1) + + # Wait for the instance to be cleaned up + self.waitForInstanceDeletion(manager, nodes[0].external_id) # Make sure we end up with only one server (the replacement) servers = manager.listServers() self.assertEqual(len(servers), 1) - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) @skip("Disabled for early v3 development") def test_building_image_cleanup_on_start(self): From 7a0d29b039e0ac8207550ce4e3a4b7a4d55788ab Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 9 Mar 2017 10:43:44 -0500 Subject: [PATCH 090/309] Re-enable test_alien_list_fail and alien_list cmd Fixes the alien_list command for ZooKeeper, re-enables the test_alien_list_fail test for it. Also, we can now remove dead code from NodePool class. Change-Id: If571c85e8c9377497311ef232ca5e381b37dbfdd --- nodepool/cmd/nodepoolcmd.py | 38 ++-- nodepool/nodepool.py | 326 -------------------------------- nodepool/tests/test_commands.py | 3 +- 3 files changed, 21 insertions(+), 346 deletions(-) diff --git a/nodepool/cmd/nodepoolcmd.py b/nodepool/cmd/nodepoolcmd.py index a8c52f575..21b7d3517 100644 --- a/nodepool/cmd/nodepoolcmd.py +++ b/nodepool/cmd/nodepoolcmd.py @@ -181,30 +181,32 @@ class NodePoolCmd(NodepoolApp): self.zk.submitBuildRequest(diskimage) def alien_list(self): - self.pool.reconfigureManagers(self.pool.config, False) + self.pool.updateConfig() t = PrettyTable(["Provider", "Hostname", "Server ID", "IP"]) t.align = 'l' - with self.pool.getDB().getSession() as session: - for provider in self.pool.config.providers.values(): - if (self.args.provider and - provider.name != self.args.provider): - continue - manager = self.pool.getProviderManager(provider) - try: - for server in manager.listServers(): - if not session.getNodeByExternalID( - provider.name, server['id']): - t.add_row([provider.name, server['name'], - server['id'], server['public_v4']]) - except Exception as e: - log.warning("Exception listing aliens for %s: %s" - % (provider.name, str(e.message))) + for provider in self.pool.config.providers.values(): + if (self.args.provider and + provider.name != self.args.provider): + continue + manager = self.pool.getProviderManager(provider) + + try: + servers = manager.listServers() + known = set([n.external_id for n in self.zk.nodeIterator() + if n.provider == provider.name]) + for server in servers: + if server.id not in known: + t.add_row([provider.name, server.name, + server.id, server.public_v4]) + except Exception as e: + log.warning("Exception listing aliens for %s: %s" + % (provider.name, str(e.message))) print t def alien_image_list(self): - self.pool.reconfigureManagers(self.pool.config, False) + self.pool.updateConfig() t = PrettyTable(["Provider", "Name", "Image ID"]) t.align = 'l' @@ -362,7 +364,7 @@ class NodePoolCmd(NodepoolApp): if self.args.command in ('image-build', 'dib-image-list', 'image-list', 'dib-image-delete', 
'image-delete', 'alien-image-list', - 'list', 'hold', 'delete'): + 'alien-list', 'list', 'hold', 'delete'): self.zk = zk.ZooKeeper() self.zk.connect(config.zookeeper_servers.values()) else: diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 677d1e9c8..5f5f55fcb 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -27,9 +27,6 @@ import socket import threading import time -import allocation -import jenkins_manager -import nodedb import exceptions import nodeutils as utils import provider_manager @@ -1167,47 +1164,6 @@ class NodePool(threading.Thread): nodepool_config.loadSecureConfig(config, self.securefile) return config - def reconfigureManagers(self, config, check_targets=True): - provider_manager.ProviderManager.reconfigure(self.config, config) - - stop_managers = [] - for t in config.targets.values(): - oldmanager = None - if self.config: - oldmanager = self.config.jenkins_managers.get(t.name) - if oldmanager: - if (t.jenkins_url != oldmanager.target.jenkins_url or - t.jenkins_user != oldmanager.target.jenkins_user or - t.jenkins_apikey != oldmanager.target.jenkins_apikey): - stop_managers.append(oldmanager) - oldmanager = None - if oldmanager: - config.jenkins_managers[t.name] = oldmanager - elif t.jenkins_url: - self.log.debug("Creating new JenkinsManager object " - "for %s" % t.name) - config.jenkins_managers[t.name] = \ - jenkins_manager.JenkinsManager(t) - config.jenkins_managers[t.name].start() - for oldmanager in stop_managers: - oldmanager.stop() - - # only do it if we need to check for targets - if check_targets: - for t in config.targets.values(): - if t.jenkins_url: - try: - info = config.jenkins_managers[t.name].getInfo() - if info['quietingDown']: - self.log.info("Target %s is offline" % t.name) - t.online = False - else: - t.online = True - except Exception: - self.log.exception("Unable to check status of %s" % - t.name) - t.online = False - def reconfigureZooKeeper(self, config): if self.config: running = self.config.zookeeper_servers.values() @@ -1229,174 +1185,12 @@ class NodePool(threading.Thread): def setConfig(self, config): self.config = config - def getDB(self): - return self.config.db - def getZK(self): return self.zk def getProviderManager(self, provider_name): return self.config.provider_managers[provider_name] - def getJenkinsManager(self, target): - if target.name in self.config.jenkins_managers: - return self.config.jenkins_managers[target.name] - else: - raise KeyError("{0} not in {1}".format(target.name, - self.config.jenkins_managers.keys())) - - def getNeededNodes(self, session, allocation_history): - self.log.debug("Beginning node launch calculation") - # Get the current demand for nodes. 
- label_demand = {} - - for name, demand in label_demand.items(): - self.log.debug(" Demand from gearman: %s: %s" % (name, demand)) - - online_targets = set() - for target in self.config.targets.values(): - if not target.online: - continue - online_targets.add(target.name) - - nodes = session.getNodes() - - def count_nodes(label_name, state): - return len([n for n in nodes - if (n.target_name in online_targets and - n.label_name == label_name and - n.state == state)]) - - def count_provider_nodes(provider_name): - count = 0 - for n in nodes: - if n.provider_name != provider_name: - continue - count += 1 - return count - - # Add a provider for each node provider, along with current - # capacity - allocation_providers = {} - for provider in self.config.providers.values(): - provider_max = provider.max_servers - n_provider = count_provider_nodes(provider.name) - available = provider_max - n_provider - if available < 0: - self.log.warning("Provider %s over-allocated: " - "max-servers %d but counted %d " % - (provider.name, provider_max, n_provider)) - available = 0 - ap = allocation.AllocationProvider(provider.name, available) - allocation_providers[provider.name] = ap - - # calculate demand for labels - # Actual need is: demand - (ready + building + used) - # NOTE(jhesketh): This assumes that the nodes in use are actually being - # used for a job in demand. - for label in self.config.labels.values(): - start_demand = label_demand.get(label.name, 0) - n_ready = count_nodes(label.name, nodedb.READY) - n_building = count_nodes(label.name, nodedb.BUILDING) - n_used = count_nodes(label.name, nodedb.USED) - n_test = count_nodes(label.name, nodedb.TEST) - ready = n_ready + n_building + n_used + n_test - - capacity = 0 - for provider in label.providers.values(): - capacity += allocation_providers[provider.name].available - - # Note actual_demand and extra_demand are written this way - # because max(0, x - y + z) != max(0, x - y) + z. - # The requested number of nodes minus those already available - actual_demand = max(0, start_demand - ready) - # Demand that accomodates extra demand from min-ready value - extra_demand = max(0, start_demand + label.min_ready - ready) - # We only request extras for the min ready value if there is - # clearly capacity for them. This is to avoid the allocator - # making bad choices spinning up nodes to satisfy min-ready when - # there is "real" work to do with other nodes. - if extra_demand <= capacity: - demand = extra_demand - else: - demand = actual_demand - - label_demand[label.name] = demand - self.log.debug(" Deficit: %s: %s " - "(start: %s min-ready: %s ready: %s capacity: %s)" % - (label.name, demand, - start_demand, label.min_ready, ready, capacity)) - - # "Target-Label-Provider" -- the triplet of info that identifies - # the source and location of each node. The mapping is - # AllocationGrantTarget -> TargetLabelProvider, because - # the allocation system produces AGTs as the final product. - tlps = {} - # label_name -> AllocationRequest - allocation_requests = {} - # Set up the request values in the allocation system - for target in self.config.targets.values(): - if not target.online: - continue - at = allocation.AllocationTarget(target.name) - for label in self.config.labels.values(): - ar = allocation_requests.get(label.name) - if not ar: - # A request for a certain number of nodes of this - # label type. We may have already started a - # request from a previous target-label in this - # loop. 
- ar = allocation.AllocationRequest(label.name, - label_demand[label.name], - allocation_history) - - nodes = session.getNodes(label_name=label.name, - target_name=target.name) - allocation_requests[label.name] = ar - ar.addTarget(at, len(nodes)) - for provider in label.providers.values(): - image = self.zk.getMostRecentImageUpload( - label.image, provider.name) - if image: - # This request may be supplied by this provider - # (and nodes from this provider supplying this - # request should be distributed to this target). - sr, agt = ar.addProvider( - allocation_providers[provider.name], - at, 0) - tlps[agt] = (target, label, - self.config.providers[provider.name]) - else: - self.log.debug(" %s does not have image %s " - "for label %s." % (provider.name, - label.image, - label.name)) - - self.log.debug(" Allocation requests:") - for ar in allocation_requests.values(): - self.log.debug(' %s' % ar) - for sr in ar.sub_requests.values(): - self.log.debug(' %s' % sr) - - nodes_to_launch = [] - - # Let the allocation system do it's thing, and then examine - # the AGT objects that it produces. - self.log.debug(" Grants:") - for ap in allocation_providers.values(): - ap.makeGrants() - for g in ap.grants: - self.log.debug(' %s' % g) - for agt in g.targets: - self.log.debug(' %s' % agt) - tlp = tlps[agt] - nodes_to_launch.append((tlp, agt.amount)) - - allocation_history.grantsDone() - - self.log.debug("Finished node launch calculation") - return nodes_to_launch - def updateConfig(self): config = self.loadConfig() provider_manager.ProviderManager.reconfigure(self.config, config) @@ -1557,123 +1351,3 @@ class NodePool(threading.Thread): self._wake_condition.acquire() self._wake_condition.wait(self.watermark_sleep) self._wake_condition.release() - - def _run(self, session, allocation_history): - nodes_to_launch = self.getNeededNodes(session, allocation_history) - - for (tlp, num_to_launch) in nodes_to_launch: - (target, label, provider) = tlp - if (not target.online) or (not num_to_launch): - continue - self.log.info("Need to launch %s %s nodes for %s on %s" % - (num_to_launch, label.name, - target.name, provider.name)) - for i in range(num_to_launch): - cloud_image = self.zk.getMostRecentImageUpload( - label.image, provider.name) - if not cloud_image: - self.log.debug("No current image for %s on %s" - % (label.image, provider.name)) - else: - self.launchNode(session, provider, label, target) - - def launchNode(self, session, provider, label, target): - try: - self._launchNode(session, provider, label, target) - except Exception: - self.log.exception( - "Could not launch node %s on %s", label.name, provider.name) - - def _launchNode(self, session, provider, label, target): - provider = self.config.providers[provider.name] - timeout = provider.boot_timeout - launch_timeout = provider.launch_timeout - if provider.azs: - az = random.choice(provider.azs) - else: - az = None - node = session.createNode(provider.name, label.name, target.name, az) - t = NodeLauncher(self, provider, label, target, node.id, timeout, - launch_timeout) - t.start() - - def _deleteNode(self, session, node): - self.log.debug("Deleting node id: %s which has been in %s " - "state for %s hours" % - (node.id, nodedb.STATE_NAMES[node.state], - (time.time() - node.state_time) / (60 * 60))) - # Delete a node - if node.state != nodedb.DELETE: - # Don't write to the session if not needed. 
- node.state = nodedb.DELETE - self.updateStats(session, node.provider_name) - provider = self.config.providers[node.provider_name] - target = self.config.targets[node.target_name] - label = self.config.labels.get(node.label_name, None) - if label and label.image in provider.images: - image_name = provider.images[label.image].name - else: - image_name = None - manager = self.getProviderManager(provider) - - if target.jenkins_url and (node.nodename is not None): - jenkins = self.getJenkinsManager(target) - jenkins_name = node.nodename - if jenkins.nodeExists(jenkins_name): - jenkins.deleteNode(jenkins_name) - self.log.info("Deleted jenkins node id: %s" % node.id) - - if node.manager_name is not None: - try: - self.revokeAssignedNode(node) - except Exception: - self.log.exception("Exception revoking node id: %s" % - node.id) - - if node.external_id: - try: - self.log.debug('Deleting server %s for node id: %s' % - (node.external_id, node.id)) - manager.cleanupServer(node.external_id) - manager.waitForServerDeletion(node.external_id) - except provider_manager.NotFound: - pass - node.external_id = None - - node.delete() - self.log.info("Deleted node id: %s" % node.id) - - if self.statsd: - dt = int((time.time() - node.state_time) * 1000) - key = 'nodepool.delete.%s.%s.%s' % (image_name, - node.provider_name, - node.target_name) - self.statsd.timing(key, dt) - self.statsd.incr(key) - self.updateStats(session, node.provider_name) - - def periodicCheck(self, session): - # This function should be run periodically to make sure we can - # still access hosts via ssh. - - self.log.debug("Starting periodic check") - for node in session.getNodes(): - if node.state != nodedb.READY: - continue - provider = self.config.providers[node.provider_name] - if node.label_name in self.config.labels: - label = self.config.labels[node.label_name] - image = provider.images[label.image] - connect_kwargs = dict(key_filename=image.private_key) - try: - if utils.ssh_connect(node.ip, image.username, - connect_kwargs=connect_kwargs): - continue - except Exception: - self.log.exception("SSH Check failed for node id: %s" % - node.id) - else: - self.log.exception("Node with non-existing label %s" % - node.label_name) - self.deleteNode(node.id) - self.log.debug("Finished periodic check") diff --git a/nodepool/tests/test_commands.py b/nodepool/tests/test_commands.py index 81a01300b..0d938bcc6 100644 --- a/nodepool/tests/test_commands.py +++ b/nodepool/tests/test_commands.py @@ -89,7 +89,6 @@ class TestNodepoolCMD(tests.DBTestCase): self.waitForUploadRecordDeletion('fake-provider', 'fake-image', image.build_id, image.id) - @skip("Disabled for early v3 development") def test_alien_list_fail(self): def fail_list(self): raise RuntimeError('Fake list error') @@ -98,7 +97,7 @@ class TestNodepoolCMD(tests.DBTestCase): fail_list)) configfile = self.setup_config("node_cmd.yaml") - self.patch_argv("-c", configfile, "alien-list") + self.patch_argv("-c", configfile, "alien-list", "fakeprovider") nodepoolcmd.main() def test_alien_image_list_empty(self): From 0c7146053eff5095b6b87131350ed0ca45de9099 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 10 Mar 2017 10:09:46 -0500 Subject: [PATCH 091/309] Remove job_list, job_create, job_delete cmds/tests These don't make sense in the ZuulV3 world. 
Change-Id: I2cf4d5a546f61cebbbe1c708b675aaa496fabdf5 --- nodepool/cmd/nodepoolcmd.py | 41 --------------------------------- nodepool/tests/test_commands.py | 21 ----------------- 2 files changed, 62 deletions(-) diff --git a/nodepool/cmd/nodepoolcmd.py b/nodepool/cmd/nodepoolcmd.py index 21b7d3517..197d03af3 100644 --- a/nodepool/cmd/nodepoolcmd.py +++ b/nodepool/cmd/nodepoolcmd.py @@ -125,23 +125,6 @@ class NodePoolCmd(NodepoolApp): help='Validate configuration file') cmd_config_validate.set_defaults(func=self.config_validate) - cmd_job_list = subparsers.add_parser('job-list', help='list jobs') - cmd_job_list.set_defaults(func=self.job_list) - - cmd_job_create = subparsers.add_parser('job-create', help='create job') - cmd_job_create.add_argument( - 'name', - help='job name') - cmd_job_create.add_argument('--hold-on-failure', - help='number of nodes to hold when this job fails') - cmd_job_create.set_defaults(func=self.job_create) - - cmd_job_delete = subparsers.add_parser( - 'job-delete', - help='delete job') - cmd_job_delete.set_defaults(func=self.job_delete) - cmd_job_delete.add_argument('id', help='job id') - self.args = parser.parse_args() def setup_logging(self): @@ -323,28 +306,6 @@ class NodePoolCmd(NodepoolApp): log.info("Configuration validation complete") #TODO(asselin,yolanda): add validation of secure.conf - def job_list(self): - t = PrettyTable(["ID", "Name", "Hold on Failure"]) - t.align = 'l' - with self.pool.getDB().getSession() as session: - for job in session.getJobs(): - t.add_row([job.id, job.name, job.hold_on_failure]) - print t - - def job_create(self): - with self.pool.getDB().getSession() as session: - session.createJob(self.args.name, - hold_on_failure=self.args.hold_on_failure) - self.job_list() - - def job_delete(self): - with self.pool.getDB().getSession() as session: - job = session.getJob(self.args.id) - if not job: - print "Job %s not found." 
% self.args.id - else: - job.delete() - def _wait_for_threads(self, threads): for t in threads: if t: @@ -367,8 +328,6 @@ class NodePoolCmd(NodepoolApp): 'alien-list', 'list', 'hold', 'delete'): self.zk = zk.ZooKeeper() self.zk.connect(config.zookeeper_servers.values()) - else: - self.pool.reconfigureDatabase(config) self.pool.setConfig(config) self.args.func() diff --git a/nodepool/tests/test_commands.py b/nodepool/tests/test_commands.py index 0d938bcc6..40a05bbda 100644 --- a/nodepool/tests/test_commands.py +++ b/nodepool/tests/test_commands.py @@ -21,8 +21,6 @@ import fixtures import mock import testtools -from unittest import skip - from nodepool.cmd import nodepoolcmd from nodepool import tests from nodepool import zk @@ -268,22 +266,3 @@ class TestNodepoolCMD(tests.DBTestCase): self.waitForImage('fake-provider', 'fake-image', [image]) self.assert_listed(configfile, ['dib-image-list'], 4, zk.READY, 2) - - @skip("Disabled for early v3 development") - def test_job_create(self): - configfile = self.setup_config('node.yaml') - self.patch_argv("-c", configfile, "job-create", "fake-job", - "--hold-on-failure", "1") - nodepoolcmd.main() - self.assert_listed(configfile, ['job-list'], 2, 1, 1) - - @skip("Disabled for early v3 development") - def test_job_delete(self): - configfile = self.setup_config('node.yaml') - self.patch_argv("-c", configfile, "job-create", "fake-job", - "--hold-on-failure", "1") - nodepoolcmd.main() - self.assert_listed(configfile, ['job-list'], 2, 1, 1) - self.patch_argv("-c", configfile, "job-delete", "1") - nodepoolcmd.main() - self.assert_listed(configfile, ['job-list'], 0, 1, 0) From 03299d3fc1849ffcef186772e6fc324c9fb177a6 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 10 Mar 2017 10:24:18 -0500 Subject: [PATCH 092/309] Re-enable test_image_upload_fail Not entirely certain why this was disabled, but meh. Change-Id: Ibebaa1cb26300219491058ad36ecfb430e34de9a --- nodepool/tests/test_builder.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nodepool/tests/test_builder.py b/nodepool/tests/test_builder.py index 9d95f45a1..476f6998e 100644 --- a/nodepool/tests/test_builder.py +++ b/nodepool/tests/test_builder.py @@ -15,7 +15,6 @@ import os import fixtures -from unittest import skip from nodepool import builder, exceptions, fakeprovider, tests from nodepool import zk @@ -96,7 +95,6 @@ class TestNodePoolBuilder(tests.DBTestCase): nb.start() nb.stop() - @skip("Disabled for early v3 development") def test_image_upload_fail(self): """Test that image upload fails are handled properly.""" From 94c63821bc2cce7b173e1e26548ab43b8e457591 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 10 Mar 2017 10:33:09 -0500 Subject: [PATCH 093/309] Re-enable test_nodepool_osc_config_reload This required adding a check to the Nodepool thread to verify it was started before we join on it. 
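For background (a minimal illustrative sketch, not part of the change itself):
calling join() on a thread that was never started raises RuntimeError, while
isAlive() simply returns False for an unstarted thread, so the stop path can
guard the join safely.

    import threading

    t = threading.Thread(target=lambda: None)
    # If t.start() never ran (for example, startup aborted early), t.join()
    # would raise RuntimeError.  isAlive() returns False in that case (and
    # also after the thread has finished), so the join is skipped safely.
    if t.isAlive():
        t.join()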
Change-Id: I3cd7395233ac77cbd5805bcd7dd5a88e576cba26
---
 nodepool/nodepool.py                     | 3 ++-
 nodepool/tests/test_shade_integration.py | 3 ---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py
index 5f5f55fcb..c8ccbe3b7 100644
--- a/nodepool/nodepool.py
+++ b/nodepool/nodepool.py
@@ -1155,7 +1155,8 @@ class NodePool(threading.Thread):
                 self.log.debug("Waiting for %s" % thd.name)
                 thd.join()
 
-        self.join()
+        if self.isAlive():
+            self.join()
         self.zk.disconnect()
         self.log.debug("Finished stopping")
 
diff --git a/nodepool/tests/test_shade_integration.py b/nodepool/tests/test_shade_integration.py
index 6da6a1b27..6c06e924c 100644
--- a/nodepool/tests/test_shade_integration.py
+++ b/nodepool/tests/test_shade_integration.py
@@ -20,8 +20,6 @@ import shade
 import testtools
 import yaml
 
-from unittest import skip
-
 from nodepool import config as nodepool_config
 from nodepool import provider_manager
 from nodepool import tests
@@ -75,7 +73,6 @@ class TestShadeIntegration(tests.IntegrationTestCase):
         pm.start()
         self.assertEqual(pm._client.auth, auth_data)
 
-    @skip("Disabled for early v3 development")
     def test_nodepool_osc_config_reload(self):
         configfile = self.setup_config('integration_osc.yaml')
         auth_data = {'username': 'os_real',

From f55a9236f4173910e239ccda5e7209bacbd148cd Mon Sep 17 00:00:00 2001
From: David Shrewsbury
Date: Fri, 10 Mar 2017 10:42:01 -0500
Subject: [PATCH 094/309] Re-enable TestWebApp tests

Change-Id: I3c245634d0e0f23407eb54898b1208c633c439eb
---
 nodepool/tests/test_webapp.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/nodepool/tests/test_webapp.py b/nodepool/tests/test_webapp.py
index dd3627f7b..08295bb4c 100644
--- a/nodepool/tests/test_webapp.py
+++ b/nodepool/tests/test_webapp.py
@@ -17,14 +17,12 @@ import json
 import logging
 import urllib2
 
-from unittest import skip
 
 from nodepool import tests
 
 
 class TestWebApp(tests.DBTestCase):
     log = logging.getLogger("nodepool.TestWebApp")
 
-    @skip("Disabled for early v3 development")
     def test_image_list(self):
         configfile = self.setup_config('node.yaml')
         pool = self.useNodepool(configfile, watermark_sleep=1)
@@ -35,7 +33,7 @@
         port = webapp.server.socket.getsockname()[1]
 
         self.waitForImage('fake-provider', 'fake-image')
-        self.waitForNodes(pool)
+        self.waitForNodes('fake-label')
 
         req = urllib2.Request(
             "http://localhost:%s/image-list" % port)
@@ -45,7 +43,6 @@
         data = f.read()
         self.assertTrue('fake-image' in data)
 
-    @skip("Disabled for early v3 development")
     def test_dib_image_list_json(self):
         configfile = self.setup_config('node.yaml')
         pool = self.useNodepool(configfile, watermark_sleep=1)
@@ -56,7 +53,7 @@
         port = webapp.server.socket.getsockname()[1]
 
         self.waitForImage('fake-provider', 'fake-image')
-        self.waitForNodes(pool)
+        self.waitForNodes('fake-label')
 
         req = urllib2.Request(
             "http://localhost:%s/dib-image-list.json" % port)

From e2d103005e77fb79edd0e6641f6e3fdc90976690 Mon Sep 17 00:00:00 2001
From: Monty Taylor
Date: Fri, 10 Mar 2017 10:04:37 -0600
Subject: [PATCH 095/309] Rename osc to occ in tests

occ is the general acronym for os-client-config, while osc generally
stands for python-openstackclient. While it doesn't actually matter,
it took me half a second to realize while reviewing the previous
change that we weren't trying to do something with
python-openstackclient.
Change-Id: I201f8c5b987bd03ba4798a316540a96c9c8defde
---
 ...tegration_osc.yaml => integration_occ.yaml} |  0
 nodepool/tests/test_shade_integration.py       | 18 +++++++++---------
 2 files changed, 9 insertions(+), 9 deletions(-)
 rename nodepool/tests/fixtures/{integration_osc.yaml => integration_occ.yaml} (100%)

diff --git a/nodepool/tests/fixtures/integration_osc.yaml b/nodepool/tests/fixtures/integration_occ.yaml
similarity index 100%
rename from nodepool/tests/fixtures/integration_osc.yaml
rename to nodepool/tests/fixtures/integration_occ.yaml
diff --git a/nodepool/tests/test_shade_integration.py b/nodepool/tests/test_shade_integration.py
index 6c06e924c..d6bf7bae5 100644
--- a/nodepool/tests/test_shade_integration.py
+++ b/nodepool/tests/test_shade_integration.py
@@ -57,14 +57,14 @@ class TestShadeIntegration(tests.IntegrationTestCase):
         self.assertEqual(pm._client.auth, auth_data)
         self.assertEqual(pm._client.region_name, 'real-region')
 
-    def test_nodepool_osc_config(self):
-        configfile = self.setup_config('integration_osc.yaml')
+    def test_nodepool_occ_config(self):
+        configfile = self.setup_config('integration_occ.yaml')
         auth_data = {'username': 'os_real',
                      'project_name': 'os_real',
                      'password': 'os_real',
                      'auth_url': 'os_real'}
-        osc_config = {'clouds': {'real-cloud': {'auth': auth_data}}}
-        self._use_cloud_config(osc_config)
+        occ_config = {'clouds': {'real-cloud': {'auth': auth_data}}}
+        self._use_cloud_config(occ_config)
 
         config = nodepool_config.loadConfig(configfile)
         self.assertIn('real-provider', config.providers)
@@ -73,14 +73,14 @@ class TestShadeIntegration(tests.IntegrationTestCase):
         pm.start()
         self.assertEqual(pm._client.auth, auth_data)
 
-    def test_nodepool_osc_config_reload(self):
-        configfile = self.setup_config('integration_osc.yaml')
+    def test_nodepool_occ_config_reload(self):
+        configfile = self.setup_config('integration_occ.yaml')
         auth_data = {'username': 'os_real',
                      'project_name': 'os_real',
                      'password': 'os_real',
                      'auth_url': 'os_real'}
-        osc_config = {'clouds': {'real-cloud': {'auth': auth_data}}}
-        self._use_cloud_config(osc_config)
+        occ_config = {'clouds': {'real-cloud': {'auth': auth_data}}}
+        self._use_cloud_config(occ_config)
 
         pool = self.useNodepool(configfile, watermark_sleep=1)
         pool.updateConfig()
@@ -91,7 +91,7 @@ class TestShadeIntegration(tests.IntegrationTestCase):
         auth_data['password'] = 'os_new_real'
         os.remove(self.clouds_path)
         with open(self.clouds_path, 'w') as h:
-            yaml.safe_dump(osc_config, h)
+            yaml.safe_dump(occ_config, h)
         pool.updateConfig()
 
         provider_manager = pool.config.provider_managers['real-provider']

From a3d822dcdeff5a44c4b0ee9b83c6e24ee8bfc1ff Mon Sep 17 00:00:00 2001
From: Paul Belanger
Date: Sun, 29 Jan 2017 10:51:23 -0500
Subject: [PATCH 096/309] Fix fedora 25 pause bug with devstack

We mistakenly skipped this setting; as a result, fedora-25 images are
built by default, causing unneeded churn for our nodepool project
jobs.

Change-Id: Id91991a490709f9bbac5a4f6e9847e047b83ca51
Signed-off-by: Paul Belanger
---
 devstack/settings | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/devstack/settings b/devstack/settings
index 50a7faf83..888dcc22a 100644
--- a/devstack/settings
+++ b/devstack/settings
@@ -7,7 +7,7 @@ NODEPOOL_DIB_BASE_PATH=/opt/dib
 # NOTE(pabelanger): Be sure to also update tools/check_devstack_plugin.sh if you
 # change the defaults.
NODEPOOL_PAUSE_CENTOS_7_DIB=${NODEPOOL_PAUSE_CENTOS_7_DIB:-true} -NODEPOOL_PAUSE_FEDORA_24_DIB=${NODEPOOL_PAUSE_FEDORA_24_DIB:-true} +NODEPOOL_PAUSE_FEDORA_25_DIB=${NODEPOOL_PAUSE_FEDORA_25_DIB:-true} NODEPOOL_PAUSE_UBUNTU_PRECISE_DIB=${NODEPOOL_PAUSE_UBUNTU_PRECISE_DIB:-true} NODEPOOL_PAUSE_UBUNTU_TRUSTY_DIB=${NODEPOOL_PAUSE_UBUNTU_TRUSTY_DIB:-false} NODEPOOL_PAUSE_UBUNTU_XENIAL_DIB=${NODEPOOL_PAUSE_UBUNTU_XENIAL_DIB:-true} From 2a5698251e63452d99c9490b49f4d2ff58057d46 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 10 Mar 2017 09:23:38 -0800 Subject: [PATCH 097/309] Remove allocator This is no longer used. Change-Id: I69f588e548f2d1fb7f99217d3aef6bc24632c7e1 --- nodepool/allocation.py | 416 ----------------------------- nodepool/tests/test_allocator.py | 444 ------------------------------- 2 files changed, 860 deletions(-) delete mode 100644 nodepool/allocation.py delete mode 100644 nodepool/tests/test_allocator.py diff --git a/nodepool/allocation.py b/nodepool/allocation.py deleted file mode 100644 index c834dffc0..000000000 --- a/nodepool/allocation.py +++ /dev/null @@ -1,416 +0,0 @@ -#!/usr/bin/env python - -# Copyright (C) 2013 OpenStack Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This module holds classes that represent concepts in nodepool's -allocation algorithm. - -The algorithm is: - - Setup: - - * Establish the node providers with their current available - capacity. - * Establish requests that are to be made of each provider for a - certain label. - * Indicate which providers can supply nodes of that label. - * Indicate to which targets nodes of a certain label from a certain - provider may be distributed (and the weight that should be - given to each target when distributing). - - Run: - - * For each label, set the requested number of nodes from each - provider to be proportional to that providers overall capacity. - - * Define the 'priority' of a request as the number of requests for - the same label from other providers. - - * For each provider, sort the requests by the priority. This puts - requests that can be serviced by the fewest providers first. - - * Grant each such request in proportion to that requests portion of - the total amount requested by requests of the same priority. - - * The nodes allocated by a grant are then distributed to the targets - which are associated with the provider and label, in proportion to - that target's portion of the sum of the weights of each target for - that label. -""" - -import functools - -# History allocation tracking - -# The goal of the history allocation tracking is to ensure forward -# progress by not starving any particular label when in over-quota -# situations. For example, if you have two labels, say 'fedora' and -# 'ubuntu', and 'ubuntu' is requesting many more nodes than 'fedora', -# it is quite possible that 'fedora' never gets any allocations. 
If -# 'fedora' is required for a gate-check job, older changes may wait -# in Zuul's pipelines longer than expected while jobs for newer -# changes continue to receive 'ubuntu' nodes and overall merge -# throughput decreases during such contention. -# -# We track the history of allocations by label. A persistent -# AllocationHistory object should be kept and passed along with each -# AllocationRequest, which records its initial request in the history -# via recordRequest(). -# -# When a sub-allocation gets a grant, it records this via a call to -# AllocationHistory.recordGrant(). All the sub-allocations -# contribute to tracking the total grants for the parent -# AllocationRequest. -# -# When finished requesting grants from all providers, -# AllocationHistory.grantsDone() should be called to store the -# allocation state in the history. -# -# This history is used AllocationProvider.makeGrants() to prioritize -# requests that have not been granted in prior iterations. -# AllocationHistory.getWaitTime will return how many iterations -# each label has been waiting for an allocation. - - -class AllocationHistory(object): - '''A history of allocation requests and grants''' - - def __init__(self, history=100): - # current allocations for this iteration - # keeps elements of type - # label -> (request, granted) - self.current_allocations = {} - - self.history = history - # list of up to previous current_allocation - # dictionaries - self.past_allocations = [] - - def recordRequest(self, label, amount): - try: - a = self.current_allocations[label] - a['requested'] += amount - except KeyError: - self.current_allocations[label] = dict(requested=amount, - allocated=0) - - def recordGrant(self, label, amount): - try: - a = self.current_allocations[label] - a['allocated'] += amount - except KeyError: - # granted but not requested? shouldn't happen - raise - - def grantsDone(self): - # save this round of allocations/grants up to our history - self.past_allocations.insert(0, self.current_allocations) - self.past_allocations = self.past_allocations[:self.history] - self.current_allocations = {} - - def getWaitTime(self, label): - # go through the history of allocations and calculate how many - # previous iterations this label has received none of its - # requested allocations. - wait = 0 - - # We don't look at the current_alloctions here; only - # historical. With multiple providers, possibly the first - # provider has given nodes to the waiting label (which would - # be recorded in current_allocations), and a second provider - # should fall back to using the usual ratio-based mechanism? - for i, a in enumerate(self.past_allocations): - if (label in a) and (a[label]['allocated'] == 0): - wait = i + 1 - continue - - # only interested in consecutive failures to allocate. - break - - return wait - - -class AllocationProvider(object): - """A node provider and its capacity.""" - def __init__(self, name, available): - self.name = name - # if this is negative, many of the calcuations turn around and - # we start handing out nodes that don't exist. 
- self.available = available if available >= 0 else 0 - self.sub_requests = [] - self.grants = [] - - def __repr__(self): - return '' % self.name - - def makeGrants(self): - # build a list of (request,wait-time) tuples - all_reqs = [(x, x.getWaitTime()) for x in self.sub_requests] - - # reqs with no wait time get processed via ratio mechanism - reqs = [x[0] for x in all_reqs if x[1] == 0] - - # we prioritize whoever has been waiting the longest and give - # them whatever is available. If we run out, put them back in - # the ratio queue - waiters = [x for x in all_reqs if x[1] != 0] - waiters.sort(key=lambda x: x[1], reverse=True) - - for w in waiters: - w = w[0] - if self.available > 0: - w.grant(min(int(w.amount), self.available)) - else: - reqs.append(w) - - # Sort the remaining requests by priority so we fill the most - # specific requests first (e.g., if this provider is the only - # one that can supply foo nodes, then it should focus on - # supplying them and leave bar nodes to other providers). - reqs.sort(lambda a, b: cmp(a.getPriority(), b.getPriority())) - - for req in reqs: - total_requested = 0.0 - # Within a specific priority, limit the number of - # available nodes to a value proportionate to the request. - reqs_at_this_level = [r for r in reqs - if r.getPriority() == req.getPriority()] - for r in reqs_at_this_level: - total_requested += r.amount - if total_requested: - ratio = float(req.amount) / total_requested - else: - ratio = 0.0 - - grant = int(round(req.amount)) - grant = min(grant, int(round(self.available * ratio))) - # This adjusts our availability as well as the values of - # other requests, so values will be correct the next time - # through the loop. - req.grant(grant) - - -class AllocationRequest(object): - """A request for a number of labels.""" - - def __init__(self, name, amount, history=None): - self.name = name - self.amount = float(amount) - # Sub-requests of individual providers that make up this - # request. AllocationProvider -> AllocationSubRequest - self.sub_requests = {} - # Targets to which nodes from this request may be assigned. - # AllocationTarget -> AllocationRequestTarget - self.request_targets = {} - - if history is not None: - self.history = history - else: - self.history = AllocationHistory() - - self.history.recordRequest(name, amount) - - # subrequests use these - self.recordGrant = functools.partial(self.history.recordGrant, name) - self.getWaitTime = functools.partial(self.history.getWaitTime, name) - - def __repr__(self): - return '' % (self.amount, self.name) - - def addTarget(self, target, current): - art = AllocationRequestTarget(self, target, current) - self.request_targets[target] = art - - def addProvider(self, provider, target): - # Handle being called multiple times with different targets. - s = self.sub_requests.get(provider) - if not s: - s = AllocationSubRequest(self, provider) - agt = s.addTarget(self.request_targets[target]) - self.sub_requests[provider] = s - if s not in provider.sub_requests: - provider.sub_requests.append(s) - self.makeRequests() - return s, agt - - def makeRequests(self): - # (Re-)distribute this request across all of its providers. 
- total_available = 0.0 - for sub_request in self.sub_requests.values(): - total_available += sub_request.provider.available - for sub_request in self.sub_requests.values(): - if total_available: - ratio = float(sub_request.provider.available) / total_available - else: - ratio = 0.0 - sub_request.setAmount(ratio * self.amount) - - -class AllocationSubRequest(object): - """A request for a number of images from a specific provider.""" - def __init__(self, request, provider): - self.request = request - self.provider = provider - self.amount = 0.0 - self.targets = [] - - def __repr__(self): - return '' % ( - self.amount, self.request.amount, self.request.name, - self.provider.name) - - def addTarget(self, request_target): - agt = AllocationGrantTarget(self, request_target) - self.targets.append(agt) - return agt - - def setAmount(self, amount): - self.amount = amount - - def getPriority(self): - return len(self.request.sub_requests) - - def getWaitTime(self): - return self.request.getWaitTime() - - def grant(self, amount): - # Grant this request (with the supplied amount). Adjust this - # sub-request's value to the actual, as well as the values of - # any remaining sub-requests. - - # fractional amounts don't make sense - assert int(amount) == amount - - # Remove from the set of sub-requests so that this is not - # included in future calculations. - self.provider.sub_requests.remove(self) - del self.request.sub_requests[self.provider] - if amount > 0: - grant = AllocationGrant(self.request, self.provider, - amount, self.targets) - self.request.recordGrant(amount) - # This is now a grant instead of a request. - self.provider.grants.append(grant) - else: - grant = None - amount = 0 - self.amount = amount - # Adjust provider and request values accordingly. - self.request.amount -= amount - self.provider.available -= (amount) - # Adjust the requested values for related sub-requests. - self.request.makeRequests() - # Allocate these granted nodes to targets. - if grant: - grant.makeAllocations() - - -class AllocationGrant(object): - """A grant of a certain number of nodes of an image from a - specific provider.""" - - def __init__(self, request, provider, amount, targets): - self.request = request - self.provider = provider - self.amount = amount - self.targets = targets - - def __repr__(self): - return '' % ( - self.amount, self.request.name, self.provider.name) - - def makeAllocations(self): - # Allocate this grant to the linked targets. - total_current = 0 - for agt in self.targets: - total_current += agt.request_target.current - amount = self.amount - # Add the nodes in this allocation to the total number of - # nodes for this image so that we're setting our target - # allocations based on a portion of the total future nodes. - total_current += amount - remaining_targets = len(self.targets) - for agt in self.targets: - # Evenly distribute the grants across all targets - ratio = 1.0 / remaining_targets - # Take the weight and apply it to the total number of - # nodes to this image to figure out how many of the total - # nodes should ideally be on this target. - desired_count = int(round(ratio * total_current)) - # The number of nodes off from our calculated target. - delta = desired_count - agt.request_target.current - # Use the delta as the allocation for this target, but - # make sure it's bounded by 0 and the number of nodes we - # have available to allocate. 
- allocation = min(delta, amount) - allocation = max(allocation, 0) - - # The next time through the loop, we have reduced our - # grant by this amount. - amount -= allocation - # Don't consider this target's count in the total number - # of nodes in the next iteration, nor the nodes we have - # just allocated. - total_current -= agt.request_target.current - total_current -= allocation - # Since we aren't considering this target's count, also - # don't consider this target itself when calculating the - # ratio. - remaining_targets -= 1 - # Set the amount of this allocation. - agt.allocate(allocation) - - -class AllocationTarget(object): - """A target to which nodes may be assigned.""" - def __init__(self, name): - self.name = name - - def __repr__(self): - return '' % (self.name) - - -class AllocationRequestTarget(object): - """A request associated with a target to which nodes may be assigned.""" - def __init__(self, request, target, current): - self.target = target - self.request = request - self.current = current - - -class AllocationGrantTarget(object): - """A target for a specific grant to which nodes may be assigned.""" - def __init__(self, sub_request, request_target): - self.sub_request = sub_request - self.request_target = request_target - self.amount = 0 - - def __repr__(self): - return '' % ( - self.amount, self.sub_request.request.name, - self.request_target.target.name) - - def allocate(self, amount): - # This is essentially the output of this system. This - # represents the number of nodes of a specific image from a - # specific provider that should be assigned to a specific - # target. - self.amount = amount - # Update the number of nodes of this image that are assigned - # to this target to assist in other allocation calculations - self.request_target.current += amount diff --git a/nodepool/tests/test_allocator.py b/nodepool/tests/test_allocator.py deleted file mode 100644 index 23279f606..000000000 --- a/nodepool/tests/test_allocator.py +++ /dev/null @@ -1,444 +0,0 @@ -# Copyright (C) 2014 OpenStack Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import testscenarios - -from nodepool import tests -from nodepool import allocation - - -class OneLabel(tests.AllocatorTestCase, tests.BaseTestCase): - """The simplest case: one each of providers, labels, and - targets. - - Result AGT is: - * label1 from provider1 - """ - - scenarios = [ - ('one_node', - dict(provider1=10, label1=1, results=[1])), - ('two_nodes', - dict(provider1=10, label1=2, results=[2])), - ] - - def setUp(self): - super(OneLabel, self).setUp() - ap1 = allocation.AllocationProvider('provider1', self.provider1) - at1 = allocation.AllocationTarget('target1') - ar1 = allocation.AllocationRequest('label1', self.label1) - ar1.addTarget(at1, 0) - self.agt.append(ar1.addProvider(ap1, at1)[1]) - ap1.makeGrants() - - -class TwoLabels(tests.AllocatorTestCase, tests.BaseTestCase): - """Two labels from one provider. 
- - Result AGTs are: - * label1 from provider1 - * label1 from provider2 - """ - - scenarios = [ - ('one_node', - dict(provider1=10, label1=1, label2=1, results=[1, 1])), - ('two_nodes', - dict(provider1=10, label1=2, label2=2, results=[2, 2])), - ] - - def setUp(self): - super(TwoLabels, self).setUp() - ap1 = allocation.AllocationProvider('provider1', self.provider1) - at1 = allocation.AllocationTarget('target1') - ar1 = allocation.AllocationRequest('label1', self.label1) - ar2 = allocation.AllocationRequest('label2', self.label2) - ar1.addTarget(at1, 0) - ar2.addTarget(at1, 0) - self.agt.append(ar1.addProvider(ap1, at1)[1]) - self.agt.append(ar2.addProvider(ap1, at1)[1]) - ap1.makeGrants() - - -class TwoProvidersTwoLabels(tests.AllocatorTestCase, tests.BaseTestCase): - """Two labels, each of which is supplied by both providers. - - Result AGTs are: - * label1 from provider1 - * label2 from provider1 - * label1 from provider2 - * label2 from provider2 - """ - - scenarios = [ - ('one_node', - dict(provider1=10, provider2=10, label1=1, label2=1, - results=[1, 1, 0, 0])), - ('two_nodes', - dict(provider1=10, provider2=10, label1=2, label2=2, - results=[1, 1, 1, 1])), - ('three_nodes', - dict(provider1=10, provider2=10, label1=3, label2=3, - results=[2, 2, 1, 1])), - ('four_nodes', - dict(provider1=10, provider2=10, label1=4, label2=4, - results=[2, 2, 2, 2])), - ('four_nodes_at_quota', - dict(provider1=4, provider2=4, label1=4, label2=4, - results=[2, 2, 2, 2])), - ('four_nodes_over_quota', - dict(provider1=2, provider2=2, label1=4, label2=4, - results=[1, 1, 1, 1])), - ('negative_provider', - dict(provider1=-5, provider2=20, label1=5, label2=5, - results=[0, 0, 5, 5])), - ] - - def setUp(self): - super(TwoProvidersTwoLabels, self).setUp() - ap1 = allocation.AllocationProvider('provider1', self.provider1) - ap2 = allocation.AllocationProvider('provider2', self.provider2) - at1 = allocation.AllocationTarget('target1') - ar1 = allocation.AllocationRequest('label1', self.label1) - ar2 = allocation.AllocationRequest('label2', self.label2) - ar1.addTarget(at1, 0) - ar2.addTarget(at1, 0) - self.agt.append(ar1.addProvider(ap1, at1)[1]) - self.agt.append(ar2.addProvider(ap1, at1)[1]) - self.agt.append(ar1.addProvider(ap2, at1)[1]) - self.agt.append(ar2.addProvider(ap2, at1)[1]) - ap1.makeGrants() - ap2.makeGrants() - - -class TwoProvidersTwoLabelsOneShared(tests.AllocatorTestCase, - tests.BaseTestCase): - """One label is served by both providers, the other can only come - from one. This tests that the allocator uses the diverse provider - to supply the label that can come from either while reserving - nodes from the more restricted provider for the label that can - only be supplied by it. - - label1 is supplied by provider1 and provider2. - label2 is supplied only by provider2. 
- - Result AGTs are: - * label1 from provider1 - * label2 from provider1 - * label2 from provider2 - """ - - scenarios = [ - ('one_node', - dict(provider1=10, provider2=10, label1=1, label2=1, - results=[1, 1, 0])), - ('two_nodes', - dict(provider1=10, provider2=10, label1=2, label2=2, - results=[2, 1, 1])), - ('three_nodes', - dict(provider1=10, provider2=10, label1=3, label2=3, - results=[3, 2, 1])), - ('four_nodes', - dict(provider1=10, provider2=10, label1=4, label2=4, - results=[4, 2, 2])), - ('four_nodes_at_quota', - dict(provider1=4, provider2=4, label1=4, label2=4, - results=[4, 0, 4])), - ('four_nodes_over_quota', - dict(provider1=2, provider2=2, label1=4, label2=4, - results=[2, 0, 2])), - ] - - def setUp(self): - super(TwoProvidersTwoLabelsOneShared, self).setUp() - ap1 = allocation.AllocationProvider('provider1', self.provider1) - ap2 = allocation.AllocationProvider('provider2', self.provider2) - at1 = allocation.AllocationTarget('target1') - ar1 = allocation.AllocationRequest('label1', self.label1) - ar2 = allocation.AllocationRequest('label2', self.label2) - ar1.addTarget(at1, 0) - ar2.addTarget(at1, 0) - self.agt.append(ar1.addProvider(ap1, at1)[1]) - self.agt.append(ar2.addProvider(ap1, at1)[1]) - self.agt.append(ar2.addProvider(ap2, at1)[1]) - ap1.makeGrants() - ap2.makeGrants() - - -class RoundRobinAllocation(tests.RoundRobinTestCase, tests.BaseTestCase): - """Test the round-robin behaviour of the AllocationHistory object to - ensure fairness of distribution - - """ - - scenarios = [ - # * one_to_one - # - # test that with only one node available we cycle through the - # available labels. - # - # There's a slight trick with the ordering here; makeGrants() - # algorithm allocates proportionally from the available nodes - # (i.e. if there's allocations for 100 and 50, then the first - # gets twice as many of the available nodes than the second). - # The algorithm is - # - # 1) add up all your peer requests - # 2) calculate your ratio = (your_request / all_peers) - # 3) multiples that ratio by the available nodes - # 4) take the floor() (you can only allocate a whole node) - # - # So we've got 8 total requests, each requesting one node: - # - # label1 = 1/7 other requests = 0.142 * 1 available node = 0 - # label2 = 1/6 other requests = 0.166 * 1 available node = 0 - # label3 = 1/4 other requests = 0.25 * 1 available node = 0 - # ... - # label7 = 1/1 other requests = 1 * 1 available node = 1 - # - # ergo label7 is the first to be granted its request. Thus we - # start the round-robin from there - ('one_to_one', - dict(provider1=1, provider2=0, - label1=1, label2=1, label3=1, label4=1, - label5=1, label6=1, label7=1, label8=1, - results=['label7', - 'label1', - 'label2', - 'label3', - 'label4', - 'label5', - 'label6', - 'label8', - 'label7', - 'label1', - 'label2'])), - - # * at_quota - # - # Test that when at quota, every node gets allocated on every - # round; i.e. nobody ever misses out. odds go to ap1, even to - # ap2 - ('at_quota', - dict(provider1=4, provider2=4, - label1=1, label2=1, label3=1, label4=1, - label5=1, label6=1, label7=1, label8=1, - results=[ - 'label1', 'label3', 'label5', 'label7', - 'label2', 'label4', 'label6', 'label8'] * 11 - )), - - # * big_fish_little_pond - # - # In this test we have one label that far outweighs the other. - # From the description of the ratio allocation above, it can - # swamp the allocation pool and not allow other nodes to come - # online. 
- # - # Here with two nodes, we check that one node is dedicated to - # the larger label request, but the second node cycles through - # the smaller requests. - ('big_fish_little_pond', - dict(provider1=1, provider2=1, - label1=100, label2=1, label3=1, label4=1, - label5=1, label6=1, label7=1, label8=1, - # provider1 provider2 - results=['label1', 'label1', # round 1 - 'label1', 'label2', # round 2 - 'label1', 'label3', # ... - 'label1', 'label4', - 'label1', 'label5', - 'label1', 'label6', - 'label1', 'label7', - 'label1', 'label8', - 'label1', 'label2', - 'label1', 'label3', - 'label1', 'label4'])), - ] - - def setUp(self): - super(RoundRobinAllocation, self).setUp() - - ah = allocation.AllocationHistory() - - def do_it(): - ap1 = allocation.AllocationProvider('provider1', self.provider1) - ap2 = allocation.AllocationProvider('provider2', self.provider2) - - at1 = allocation.AllocationTarget('target1') - - ars = [] - ars.append(allocation.AllocationRequest('label1', self.label1, ah)) - ars.append(allocation.AllocationRequest('label2', self.label2, ah)) - ars.append(allocation.AllocationRequest('label3', self.label3, ah)) - ars.append(allocation.AllocationRequest('label4', self.label4, ah)) - ars.append(allocation.AllocationRequest('label5', self.label5, ah)) - ars.append(allocation.AllocationRequest('label6', self.label6, ah)) - ars.append(allocation.AllocationRequest('label7', self.label7, ah)) - ars.append(allocation.AllocationRequest('label8', self.label8, ah)) - - # each request to one target, and can be satisfied by both - # providers - for ar in ars: - ar.addTarget(at1, 0) - ar.addProvider(ap1, at1) - ar.addProvider(ap2, at1) - - ap1.makeGrants() - for g in ap1.grants: - self.allocations.append(g.request.name) - ap2.makeGrants() - for g in ap2.grants: - self.allocations.append(g.request.name) - - ah.grantsDone() - - # run the test several times to make sure we bounce around - # enough - for i in range(0, 11): - do_it() - - -class RoundRobinFixedProvider(tests.RoundRobinTestCase, tests.BaseTestCase): - """Test that round-robin behaviour exists when we have a more complex - situation where some nodes can only be provided by some providers - - * label1 is only able to be allocated from provider1 - * label8 is only able to be allocated from provider2 - """ - - scenarios = [ - # * fixed_even - # - # What we see below is an edge case: - # - # Below, label1 always gets chosen because for provider1. - # This is because label1 is requesting 1.0 nodes (it can only - # run on provider1) and all the other labels are requesting - # only 0.5 of a node (they can run on either and no - # allocations have been made yet). We do actually grant in a - # round-robin fashion, but int(0.5) == 0 so no node gets - # allocated. We fall back to the ratio calculation and label1 - # wins. - # - # However, after provider1.makeGrants(), the other labels - # increase their request on the remaning provider2 to their - # full 1.0 nodes. Now the "fight" starts and we allocate in - # the round-robin fashion. - ('fixed_even', - dict(provider1=1, provider2=1, - label1=1, label2=1, label3=1, label4=1, - label5=1, label6=1, label7=1, label8=1, - # provider1 provider2 - results=['label1', 'label6', # round 1 - 'label1', 'label8', # round 2 - 'label1', 'label2', # ... 
- 'label1', 'label3', - 'label1', 'label4', - 'label1', 'label5', - 'label1', 'label7', - 'label1', 'label6', - 'label1', 'label8', - 'label1', 'label2', - 'label1', 'label3'])), - - # * over_subscribed - # - # In contrast to above, any grant made will be satisfied. We - # see that the fixed node label1 and label8 do not get as full - # a share as the non-fixed nodes -- but they do round-robin - # with the other requests. Fixing this is left as an exercise - # for the reader :) - ('over_subscribed', - dict(provider1=1, provider2=1, - label1=20, label2=20, label3=20, label4=20, - label5=20, label6=20, label7=20, label8=20, - results=['label1', 'label6', - 'label2', 'label8', - 'label3', 'label3', - 'label4', 'label4', - 'label5', 'label5', - 'label7', 'label7', - 'label1', 'label6', - 'label2', 'label8', - 'label3', 'label3', - 'label4', 'label4', - 'label5', 'label5'])), - - # * even - # - # When there's enough nodes to go around, we expect everyone - # to be fully satisifed with label1 on provider1 and label8 - # on provider2 as required - ('even', - dict(provider1=4, provider2=4, - label1=1, label2=1, label3=1, label4=1, - label5=1, label6=1, label7=1, label8=1, - results=[ - 'label1', 'label2', 'label4', 'label6', - 'label8', 'label3', 'label5', 'label7'] * 11))] - - def setUp(self): - super(RoundRobinFixedProvider, self).setUp() - - ah = allocation.AllocationHistory() - - def do_it(): - ap1 = allocation.AllocationProvider('provider1', self.provider1) - ap2 = allocation.AllocationProvider('provider2', self.provider2) - - at1 = allocation.AllocationTarget('target1') - - ars = [] - ars.append(allocation.AllocationRequest('label1', self.label1, ah)) - ars.append(allocation.AllocationRequest('label2', self.label2, ah)) - ars.append(allocation.AllocationRequest('label3', self.label3, ah)) - ars.append(allocation.AllocationRequest('label4', self.label4, ah)) - ars.append(allocation.AllocationRequest('label5', self.label5, ah)) - ars.append(allocation.AllocationRequest('label6', self.label6, ah)) - ars.append(allocation.AllocationRequest('label7', self.label7, ah)) - ars.append(allocation.AllocationRequest('label8', self.label8, ah)) - - # first ar can only go to provider1, the last only to - # provider2 - ars[0].addTarget(at1, 0) - ars[0].addProvider(ap1, at1) - ars[-1].addTarget(at1, 0) - ars[-1].addProvider(ap2, at1) - - # the rest can go anywhere - for ar in ars[1:-1]: - ar.addTarget(at1, 0) - ar.addProvider(ap1, at1) - ar.addProvider(ap2, at1) - - ap1.makeGrants() - for g in ap1.grants: - self.allocations.append(g.request.name) - - ap2.makeGrants() - for g in ap2.grants: - self.allocations.append(g.request.name) - - ah.grantsDone() - - # run the test several times to make sure we bounce around - # enough - for i in range(0, 11): - do_it() - - -def load_tests(loader, in_tests, pattern): - return testscenarios.load_tests_apply_scenarios(loader, in_tests, pattern) From a52d0b4cc9530db17a37d2bdb1d1b703e83a989d Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 10 Mar 2017 09:24:36 -0800 Subject: [PATCH 098/309] Remove jenkins_manager This is no longer used. 
Change-Id: Iaf2b5aa96c41e03d5b73254776f2b2814dd0973b --- nodepool/jenkins_manager.py | 137 ------------------------------------ 1 file changed, 137 deletions(-) delete mode 100644 nodepool/jenkins_manager.py diff --git a/nodepool/jenkins_manager.py b/nodepool/jenkins_manager.py deleted file mode 100644 index 92f3e0e4b..000000000 --- a/nodepool/jenkins_manager.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python - -# Copyright (C) 2011-2013 OpenStack Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import re - -import myjenkins -import fakeprovider -from task_manager import Task, TaskManager - - -class CreateNodeTask(Task): - def main(self, jenkins): - if 'credentials_id' in self.args: - launcher_params = {'port': 22, - 'credentialsId': self.args['credentials_id'], - 'host': self.args['host']} - else: - launcher_params = {'port': 22, - 'username': self.args['username'], - 'privatekey': self.args['private_key'], - 'host': self.args['host']} - args = dict( - name=self.args['name'], - numExecutors=self.args['executors'], - nodeDescription=self.args['description'], - remoteFS=self.args['root'], - exclusive=True, - launcher='hudson.plugins.sshslaves.SSHLauncher', - launcher_params=launcher_params) - if self.args['labels']: - args['labels'] = self.args['labels'] - try: - jenkins.create_node(**args) - except myjenkins.JenkinsException as e: - if 'already exists' in str(e): - pass - else: - raise - - -class NodeExistsTask(Task): - def main(self, jenkins): - return jenkins.node_exists(self.args['name']) - - -class DeleteNodeTask(Task): - def main(self, jenkins): - return jenkins.delete_node(self.args['name']) - - -class GetNodeConfigTask(Task): - def main(self, jenkins): - return jenkins.get_node_config(self.args['name']) - - -class SetNodeConfigTask(Task): - def main(self, jenkins): - jenkins.reconfig_node(self.args['name'], self.args['config']) - - -class StartBuildTask(Task): - def main(self, jenkins): - jenkins.build_job(self.args['name'], - parameters=self.args['params']) - - -class GetInfoTask(Task): - def main(self, jenkins): - return jenkins.get_info() - - -class JenkinsManager(TaskManager): - log = logging.getLogger("nodepool.JenkinsManager") - - def __init__(self, target): - super(JenkinsManager, self).__init__(None, target.name, target.rate) - self.target = target - self._client = self._getClient() - - def _getClient(self): - if self.target.jenkins_apikey == 'fake': - return fakeprovider.FakeJenkins(self.target.jenkins_user) - return myjenkins.Jenkins(self.target.jenkins_url, - self.target.jenkins_user, - self.target.jenkins_apikey) - - def createNode(self, name, host, description, executors, root, labels=[], - credentials_id=None, username=None, private_key=None): - args = dict(name=name, host=host, description=description, - labels=labels, executors=executors, root=root) - if credentials_id: - args['credentials_id'] = credentials_id - else: - args['username'] = username - args['private_key'] = private_key - return 
self.submitTask(CreateNodeTask(**args)) - - def nodeExists(self, name): - return self.submitTask(NodeExistsTask(name=name)) - - def deleteNode(self, name): - return self.submitTask(DeleteNodeTask(name=name)) - - LABEL_RE = re.compile(r'') - - def relabelNode(self, name, labels): - config = self.submitTask(GetNodeConfigTask(name=name)) - old = None - m = self.LABEL_RE.search(config) - if m: - old = m.group(1) - config = self.LABEL_RE.sub('' % ' '.join(labels), - config) - self.submitTask(SetNodeConfigTask(name=name, config=config)) - return old - - def startBuild(self, name, params): - self.submitTask(StartBuildTask(name=name, params=params)) - - def getInfo(self): - return self._client.get_info() From 1b04cf979f44ffe0f80308855ec2e973b1c961eb Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 28 Feb 2017 15:47:00 -0800 Subject: [PATCH 099/309] Add destructor to SSHClient Newer versions of paramiko require a client object to be explicitly closed. Fortunately, we wrap all of our use of paramiko client objects in our own class. Add a destructor to our class which closes the client object. Note, this has been tested to work (and is needed) even if a connection is not established. Change-Id: I5dff7ed254567968b42d053b85004769f8647ecb (cherry picked from commit d616e61723207ed0e29ea69d67908a00ebf2cdfb) --- nodepool/sshclient.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/nodepool/sshclient.py b/nodepool/sshclient.py index 51faca093..8be0c0089 100644 --- a/nodepool/sshclient.py +++ b/nodepool/sshclient.py @@ -25,14 +25,17 @@ class SSHClient(object): def __init__(self, ip, username, password=None, pkey=None, key_filename=None, log=None, look_for_keys=False, allow_agent=False): - client = paramiko.SSHClient() - client.set_missing_host_key_policy(paramiko.WarningPolicy()) - client.connect(ip, username=username, password=password, pkey=pkey, - key_filename=key_filename, look_for_keys=look_for_keys, - allow_agent=allow_agent) - self.client = client + self.client = paramiko.SSHClient() + self.client.set_missing_host_key_policy(paramiko.WarningPolicy()) + self.client.connect(ip, username=username, password=password, + pkey=pkey, key_filename=key_filename, + look_for_keys=look_for_keys, + allow_agent=allow_agent) self.log = log + def __del__(self): + self.client.close() + def ssh(self, action, command, get_pty=True, output=False): if self.log: self.log.debug("*** START to %s" % action) From 8845584810b69bc12b54aae11554c57662fff97d Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 10 Mar 2017 09:53:33 -0800 Subject: [PATCH 100/309] Handle exception edge cases in node launching There are some paths where we could end up not logging exceptions, so make sure they are logged. Change-Id: I26b8e4e41981e5abf06e7ba57bccc0cfebf2d247 --- nodepool/nodepool.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index c8ccbe3b7..0c06d140f 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -407,8 +407,10 @@ class NodeLauncher(threading.Thread, StatsReporter): self._launchNode() break except Exception: - self.log.exception("Launch attempt %d/%d failed for node %s:", - attempts, self._retries, self._node.id) + if attempts <= self._retries: + self.log.exception( + "Launch attempt %d/%d failed for node %s:", + attempts, self._retries, self._node.id) # If we created an instance, delete it. 
if self._node.external_id: self._manager.cleanupServer(self._node.external_id) @@ -432,6 +434,8 @@ class NodeLauncher(threading.Thread, StatsReporter): try: self._run() except Exception as e: + self.log.exception("Launch failed for node %s:", + self._node.id) self._node.state = zk.FAILED self._zk.storeNode(self._node) @@ -440,8 +444,8 @@ class NodeLauncher(threading.Thread, StatsReporter): else: statsd_key = 'error.unknown' - dt = int((time.time() - start_time) * 1000) try: + dt = int((time.time() - start_time) * 1000) self.recordLaunchStats(statsd_key, dt, self._image_name, self._node.provider, self._node.az, self._requestor) From 066942a0ac1a76cacdb9d228c7a6216c6e8acecc Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Sat, 26 Mar 2016 09:38:33 -0500 Subject: [PATCH 101/309] Stop json-encoding the nodepool metadata When we first started putting nodepool metadata into the server record in OpenStack, we json encoded the data so that we could store a dict into a field that only takes strings. We were also going to teach the ansible OpenStack Inventory about this so that it could read the data out of the groups list. However, ansible was not crazy about accepting "attempt to json decode values in the metadata" since json-encoded values are not actually part of the interface OpenStack expects - which means one of our goals, which is ansible inventory groups based on nodepool information is no longer really a thing. We could push harder on that, but we actually don't need the functionality we're getting from the json encoding. The OpenStack Inventory has supported comma separated lists of groups since before day one. And the other nodepool info we're storing stores and fetches just as easily with 4 different top level keys as it does in a json dict - and is easier to read and deal with when just looking at server records. Finally, nova has a 255 byte limit on size of the value that can be stored, so we cannot grow the information in the nodepool dict indefinitely anyway. Migrate the data to store into nodepool_ variables and a comma separated list for groups. Consume both forms, so that people upgrading will not lose track of existing stock of nodes. Finally, we don't use snapshot_id anymore - so remove it. Change-Id: I2c06dc7c2faa19e27d1fb1d9d6df78da45ffa6dd --- doc/source/operation.rst | 17 +++++++---------- nodepool/nodepool.py | 11 +++++------ nodepool/provider_manager.py | 25 +++++++++++-------------- 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/doc/source/operation.rst b/doc/source/operation.rst index 800db95a7..a821798a3 100644 --- a/doc/source/operation.rst +++ b/doc/source/operation.rst @@ -77,21 +77,18 @@ When Nodepool creates instances, it will assign the following nova metadata: groups - A json-encoded list containing the name of the image and the name + A comma separated list containing the name of the image and the name of the provider. This may be used by the Ansible OpenStack inventory plugin. - nodepool - A json-encoded dictionary with the following entries: + nodepool_image_name + The name of the image as a string. - image_name - The name of the image as a string. + nodepool_provider_name + The name of the provider as a string. - provider_name - The name of the provider as a string. - - node_id - The nodepool id of the node as an integer. + nodepool_node_id + The nodepool id of the node as an integer. 
Command Line Tools ------------------ diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index c8ccbe3b7..901d69ed6 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -16,7 +16,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import logging import os import os.path @@ -1057,15 +1056,15 @@ class NodeCleanupWorker(threading.Thread): known = set([n.external_id for n in zk_conn.nodeIterator() if n.provider == provider.name]) for server in servers: - meta = server.get('metadata', {}).get('nodepool') - if not meta: + meta = server.get('metadata', {}) + + if 'nodepool_provider_name' not in meta: self.log.debug( - "Instance %s (%s) in %s has no nodepool metadata", + "Instance %s (%s) in %s has no nodepool_provider_name", server.name, server.id, provider.name) continue - meta = json.loads(meta) - if meta['provider_name'] != provider.name: + if meta['nodepool_provider_name'] != provider.name: # Another launcher, sharing this provider but configured # with a different name, owns this. continue diff --git a/nodepool/provider_manager.py b/nodepool/provider_manager.py index 0206d720f..217d4b742 100644 --- a/nodepool/provider_manager.py +++ b/nodepool/provider_manager.py @@ -16,7 +16,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import logging import paramiko from contextlib import contextmanager @@ -186,8 +185,7 @@ class ProviderManager(object): def createServer(self, name, min_ram, image_id=None, image_name=None, az=None, key_name=None, name_filter=None, config_drive=None, nodepool_node_id=None, - nodepool_image_name=None, - nodepool_snapshot_image_id=None): + nodepool_image_name=None): if image_name: image = self.findImage(image_name) else: @@ -218,19 +216,18 @@ class ProviderManager(object): # Also list each of those values directly so that non-ansible # consumption programs don't need to play a game of knowing that # groups[0] is the image name or anything silly like that. - nodepool_meta = dict(provider_name=self.provider.name) - groups_meta = [self.provider.name] - if nodepool_node_id: - nodepool_meta['node_id'] = nodepool_node_id - if nodepool_snapshot_image_id: - nodepool_meta['snapshot_image_id'] = nodepool_snapshot_image_id + groups_list = [self.provider.name] if nodepool_image_name: - nodepool_meta['image_name'] = nodepool_image_name - groups_meta.append(nodepool_image_name) - create_args['meta'] = dict( - groups=json.dumps(groups_meta), - nodepool=json.dumps(nodepool_meta) + groups_list.append(nodepool_image_name) + meta = dict( + groups=",".join(groups_list), + nodepool_provider_name=self.provider.name, ) + if nodepool_node_id: + meta['nodepool_node_id'] = nodepool_node_id + if nodepool_image_name: + meta['nodepool_image_name'] = nodepool_image_name + create_args['meta'] = meta with shade_inner_exceptions(): return self._client.create_server(wait=False, **create_args) From 0eb7fdde1ff670e88c458efd38ceb52b63359b4d Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 10 Mar 2017 16:39:39 -0500 Subject: [PATCH 102/309] Use node ID for instance leak detection Using the instance external ID creates a race since this value isn't available while the instance is building. This could cause the leak detection code (in its current form) to delete a building instance. Instead, use the node ID we put in the server metadata for the check. 
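
To illustrate the shape of that check, a minimal sketch (hypothetical
helper names, not the exact patch):

    # A server is treated as leaked when the node ID recorded in its
    # metadata no longer resolves to a known node.  Servers that are still
    # building are safe, because the node ID is written to metadata at
    # creation time, unlike the external server ID, which only becomes
    # available once the server is active.
    def find_leaked_servers(servers, provider_name, lookup_node):
        """Yield servers owned by this provider whose node ID is unknown.

        ``servers`` is an iterable of dicts with a ``metadata`` mapping;
        ``lookup_node`` is any callable returning a node object or None.
        """
        for server in servers:
            meta = server.get('metadata', {})
            if meta.get('nodepool_provider_name') != provider_name:
                continue  # owned by another launcher sharing this cloud
            node_id = meta.get('nodepool_node_id')
            if node_id and not lookup_node(node_id):
                yield server
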
Change-Id: I6f417bf90f720ca7ded698a9760dd8feb348e638 --- nodepool/nodepool.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 901d69ed6..cc2485b90 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -273,6 +273,11 @@ class NodeLauncher(threading.Thread, StatsReporter): "for node id: %s" % (hostname, self._provider.name, config_image.name, self._node.id)) + # NOTE: We store the node ID in the server metadata to use for leaked + # instance detection. We cannot use the external server ID for this + # because that isn't available in ZooKeeper until after the server is + # active, which could cause a race in leak detection. + server = self._manager.createServer( hostname, config_image.min_ram, @@ -1047,15 +1052,7 @@ class NodeCleanupWorker(threading.Thread): for provider in self._nodepool.config.providers.values(): manager = self._nodepool.getProviderManager(provider.name) - # NOTE: Cache the servers BEFORE caching the nodes. Doing this in - # the reverse order would create a race where a new server could - # be created just after we cache the list of nodes, thus making it - # incorrectly appear as leaked since we might not have cached the - # node for it. - servers = manager.listServers() - known = set([n.external_id for n in zk_conn.nodeIterator() if n.provider == provider.name]) - - for server in servers: + for server in manager.listServers(): meta = server.get('metadata', {}) if 'nodepool_provider_name' not in meta: @@ -1069,10 +1066,12 @@ class NodeCleanupWorker(threading.Thread): # with a different name, owns this. continue - if server.id not in known: + if not zk_conn.getNode(meta['nodepool_node_id']): self.log.warning( - "Deleting leaked instance %s (%s) in %s", - server.name, server.id, provider.name + "Deleting leaked instance %s (%s) in %s " + "(unknown node id %s)", + server.name, server.id, provider.name, + meta['nodepool_node_id'] ) # Create an artifical node to use for deleting the server. node = zk.Node() From 6c708b655c48e16f47c9f9c9722e88d6fce4cfbf Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 10 Mar 2017 10:54:48 -0800 Subject: [PATCH 103/309] Fix failure of node assignment at quota The node request handler has a bug when near quota. Adds a test for the bug and changes node request handling to pause the parent ProviderWorker thread through use of a control attribute instead of idling. 
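
The control-attribute approach looks roughly like this (illustrative
class and attribute names only; the real handler carries far more state):

    # Instead of blocking in a sleep loop while the provider is at quota,
    # the handler marks itself paused and returns immediately; the owning
    # worker thread sees the flag and calls run() again on a later pass.
    class RequestHandler(object):
        def __init__(self, count_nodes, max_servers):
            self.count_nodes = count_nodes  # callable returning current count
            self.max_servers = max_servers
            self.paused = False
            self.done = False

        def run(self):
            if self.count_nodes() >= self.max_servers:
                self.paused = True          # replaces: while ...: time.sleep(1)
                return
            self.paused = False
            self.done = True                # the real code launches nodes here

Because run() can now return before the request is satisfied, it has to be
re-entrant and is re-invoked until the request completes.
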
Co-Authored By: David Shrewsbury Change-Id: I1927fcf087a398524930109616d405ca53984c5f --- nodepool/fakeprovider.py | 15 ++- nodepool/nodepool.py | 153 +++++++++++++++++------- nodepool/tests/__init__.py | 6 +- nodepool/tests/fixtures/node_quota.yaml | 56 +++++++++ nodepool/tests/test_nodepool.py | 79 ++++++++++++ 5 files changed, 262 insertions(+), 47 deletions(-) create mode 100644 nodepool/tests/fixtures/node_quota.yaml diff --git a/nodepool/fakeprovider.py b/nodepool/fakeprovider.py index 5feafe135..c78ab2eb1 100644 --- a/nodepool/fakeprovider.py +++ b/nodepool/fakeprovider.py @@ -75,6 +75,7 @@ class FakeOpenStackCloud(object): log = logging.getLogger("nodepool.FakeOpenStackCloud") def __init__(self, images=None, networks=None): + self.pause_creates = False self._image_list = images if self._image_list is None: self._image_list = [ @@ -151,7 +152,8 @@ class FakeOpenStackCloud(object): metadata=kw.get('meta', {}), manager=self, key_name=kw.get('key_name', None), - should_fail=should_fail) + should_fail=should_fail, + event=threading.Event()) instance_list.append(s) t = threading.Thread(target=self._finish, name='FakeProvider create', @@ -170,7 +172,13 @@ class FakeOpenStackCloud(object): self.log.debug("Deleted from %s" % (repr(instance_list),)) def _finish(self, obj, delay, status): - time.sleep(delay) + self.log.debug("Pause creates %s", self.pause_creates) + if self.pause_creates: + self.log.debug("Pausing") + obj.event.wait() + self.log.debug("Continuing") + else: + time.sleep(delay) obj.status = status def create_image(self, **kwargs): @@ -223,7 +231,8 @@ class FakeOpenStackCloud(object): return result def wait_for_server(self, server, **kwargs): - server.status = 'ACTIVE' + while server.status == 'BUILD': + time.sleep(0.1) return server def list_servers(self): diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index cc2485b90..41ffa321e 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -559,15 +559,26 @@ class NodeRequestHandler(object): :param NodeRequest request: The request to handle. ''' self.log = logging.getLogger("nodepool.NodeRequestHandler") - self.provider = pw.getProviderConfig() - self.zk = pw.getZK() - self.labels = pw.getLabelsConfig() - self.manager = pw.getProviderManager() - self.launcher_id = pw.launcher_id + self.pw = pw self.request = request self.launch_manager = None self.nodeset = [] self.done = False + self.chosen_az = None + self.paused = False + + def _setFromProviderWorker(self): + ''' + Set values that we pull from the parent ProviderWorker. + + We don't do this in __init__ because this class is re-entrant and we + want the updated values. + ''' + self.provider = self.pw.getProviderConfig() + self.zk = self.pw.getZK() + self.labels = self.pw.getLabelsConfig() + self.manager = self.pw.getProviderManager() + self.launcher_id = self.pw.launcher_id def _imagesAvailable(self): ''' @@ -601,20 +612,6 @@ class NodeRequestHandler(object): count += 1 return count - def _unlockNodeSet(self): - ''' - Attempt unlocking all Nodes in the object node set. - ''' - for node in self.nodeset: - if not node.lock: - continue - try: - self.zk.unlockNode(node) - except Exception: - self.log.exception("Error unlocking node:") - self.log.debug("Unlocked node %s for request %s", - node.id, self.request.id) - def _waitForNodeSet(self): ''' Fill node set for the request. @@ -639,12 +636,12 @@ class NodeRequestHandler(object): launcher has already started doing so. This would cause an expected failure from the underlying library, which is ok for now. 
''' - self.launch_manager = NodeLaunchManager( - self.zk, self.provider, self.labels, self.manager, - self.request.requestor, retries=self.provider.launch_retries) + if not self.launch_manager: + self.launch_manager = NodeLaunchManager( + self.zk, self.provider, self.labels, self.manager, + self.request.requestor, retries=self.provider.launch_retries) ready_nodes = self.zk.getReadyNodesOfTypes(self.request.node_types) - chosen_az = None for ntype in self.request.node_types: # First try to grab from the list of already available nodes. @@ -655,7 +652,7 @@ class NodeRequestHandler(object): # the selected AZ. if node.provider != self.provider.name: continue - if chosen_az and node.az != chosen_az: + if self.chosen_az and node.az != self.chosen_az: continue try: @@ -675,34 +672,37 @@ class NodeRequestHandler(object): # If we haven't already chosen an AZ, select the # AZ from this ready node. This will cause new nodes # to share this AZ, as well. - if not chosen_az and node.az: - chosen_az = node.az + if not self.chosen_az and node.az: + self.chosen_az = node.az break # Could not grab an existing node, so launch a new one. if not got_a_node: # Select grouping AZ if we didn't set AZ from a selected, # pre-existing node - if not chosen_az and self.provider.azs: - chosen_az = random.choice(self.provider.azs) - - logged = False + if not self.chosen_az and self.provider.azs: + self.chosen_az = random.choice(self.provider.azs) # If we calculate that we're at capacity, pause until nodes # are released by Zuul and removed by the NodeCleanupWorker. - while self._countNodes() >= self.provider.max_servers: - if not logged: + if self._countNodes() >= self.provider.max_servers: + self.paused = True + if not self.pw.paused: self.log.debug( "Pausing request handling to satisfy request %s", self.request) - logged = True - time.sleep(1) + self.pw.paused = True + return + + if self.paused: + self.log.debug("Unpaused request %s", self.request) + self.paused = False node = zk.Node() node.state = zk.INIT node.type = ntype node.provider = self.provider.name - node.az = chosen_az + node.az = self.chosen_az node.launcher = self.launcher_id node.allocated_to = self.request.id @@ -727,6 +727,8 @@ class NodeRequestHandler(object): ''' Main body for the NodeRequestHandler. ''' + self._setFromProviderWorker() + declined_reasons = [] if not self._imagesAvailable(): declined_reasons.append('images are not available') @@ -742,25 +744,67 @@ class NodeRequestHandler(object): self.request.id) # All launchers have declined it self.request.state = zk.FAILED + self.unlockNodeSet(clear_allocation=True) self.zk.storeNodeRequest(self.request) self.zk.unlockNodeRequest(self.request) self.done = True return - self.log.debug("Accepting node request %s", self.request.id) - self.request.state = zk.PENDING - self.zk.storeNodeRequest(self.request) + if self.paused: + self.log.debug("Retrying node request %s", self.request.id) + else: + self.log.debug("Accepting node request %s", self.request.id) + self.request.state = zk.PENDING + self.zk.storeNodeRequest(self.request) + self._waitForNodeSet() @property def alive_thread_count(self): return self.launch_manager.alive_thread_count + #---------------------------------------------------------------- + # Public methods + #---------------------------------------------------------------- + + def unlockNodeSet(self, clear_allocation=False): + ''' + Attempt unlocking all Nodes in the node set. + + :param bool clear_allocation: If true, clears the node allocated_to + attribute. 
+ ''' + for node in self.nodeset: + if not node.lock: + continue + + if clear_allocation: + node.allocated_to = None + self.zk.storeNode(node) + + try: + self.zk.unlockNode(node) + except Exception: + self.log.exception("Error unlocking node:") + self.log.debug("Unlocked node %s for request %s", + node.id, self.request.id) + + self.nodeset = [] + def run(self): + ''' + Execute node request handling. + + This code is designed to be re-entrant. Because we can't always + satisfy a request immediately (due to lack of provider resources), we + need to be able to call run() repeatedly until the request can be + fulfilled. The node set is saved and added to between calls. + ''' try: self._run() except Exception: self.log.exception("Exception in NodeRequestHandler:") + self.unlockNodeSet(clear_allocation=True) self.request.state = zk.FAILED self.zk.storeNodeRequest(self.request) self.zk.unlockNodeRequest(self.request) @@ -789,7 +833,7 @@ class NodeRequestHandler(object): for node in self.nodeset: node.allocated_to = None self.zk.storeNode(node) - self._unlockNodeSet() + self.unlockNodeSet() return True if self.launch_manager.failed_nodes: @@ -813,7 +857,7 @@ class NodeRequestHandler(object): self.request.id) self.request.state = zk.FULFILLED - self._unlockNodeSet() + self.unlockNodeSet() self.zk.storeNodeRequest(self.request) self.zk.unlockNodeRequest(self.request) return True @@ -837,6 +881,7 @@ class ProviderWorker(threading.Thread): self.nodepool = nodepool self.provider_name = provider_name self.running = False + self.paused = False self.request_handlers = [] self.watermark_sleep = nodepool.watermark_sleep self.zk = self.getZK() @@ -874,6 +919,9 @@ class ProviderWorker(threading.Thread): return for req_id in self.zk.getNodeRequests(): + if self.paused: + return + # Short-circuit for limited request handling if (provider.max_concurrency > 0 and self._activeThreads() >= provider.max_concurrency @@ -915,7 +963,7 @@ class ProviderWorker(threading.Thread): ''' active_handlers = [] for r in self.request_handlers: - if not r.poll(): + if r.paused or not r.poll(): active_handlers.append(r) self.request_handlers = active_handlers @@ -948,12 +996,33 @@ class ProviderWorker(threading.Thread): self.zk.registerLauncher(self.launcher_id) try: - self._assignHandlers() + if not self.paused: + self._assignHandlers() + else: + # If we are paused, one request handler could not satisify + # its assigned request, so we need to find it and give it + # another shot (there can be only 1). Unpause ourselves if + # it completed. + completed = True + for handler in self.request_handlers: + if handler.paused: + self.log.debug("Re-run handler %s", handler) + handler.run() + completed = False + break + if completed: + self.paused = False + self._removeCompletedHandlers() except Exception: self.log.exception("Error in ProviderWorker:") time.sleep(self.watermark_sleep) + # Cleanup on exit + if self.paused: + for handler in self.request_handlers: + handler.unlockNodeSet(clear_allocation=True) + def stop(self): ''' Shutdown the ProviderWorker thread. diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 0d9a959bd..8317c36d5 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -462,13 +462,15 @@ class DBTestCase(BaseTestCase): self.wait_for_threads() return ready_nodes[label] - def waitForNodeRequest(self, req): + def waitForNodeRequest(self, req, states=None): ''' Wait for a node request to transition to a final state. 
''' + if states is None: + states = (zk.FULFILLED, zk.FAILED) while True: req = self.zk.getNodeRequest(req.id) - if req.state in (zk.FULFILLED, zk.FAILED): + if req.state in states: break time.sleep(1) diff --git a/nodepool/tests/fixtures/node_quota.yaml b/nodepool/tests/fixtures/node_quota.yaml new file mode 100644 index 000000000..01a41cd29 --- /dev/null +++ b/nodepool/tests/fixtures/node_quota.yaml @@ -0,0 +1,56 @@ +elements-dir: . +images-dir: '{images_dir}' + +cron: + check: '*/15 * * * *' + cleanup: '*/1 * * * *' + +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +labels: + - name: fake-label + image: fake-image + min-ready: 0 + providers: + - name: fake-provider + +providers: + - name: fake-provider + region-name: fake-region + availability-zones: + - az1 + keypair: 'if-present-use-this-keypair' + username: 'fake' + password: 'fake' + auth-url: 'fake' + project-id: 'fake' + max-servers: 2 + pool: 'fake' + networks: + - net-id: 'some-uuid' + rate: 0.0001 + images: + - name: fake-image + min-ram: 8192 + name-filter: 'Fake' + meta: + key: value + key2: value + +targets: + - name: fake-target + +diskimages: + - name: fake-image + elements: + - fedora + - vm + release: 21 + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 2789243c4..19de6693f 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -74,6 +74,85 @@ class TestNodepool(tests.DBTestCase): self.zk.deleteNodeRequest(req) self.waitForNodeRequestLockDeletion(req.id) + def test_node_assignment_at_quota(self): + ''' + Successful node launch should have unlocked nodes in READY state + and assigned to the request. + ''' + configfile = self.setup_config('node_quota.yaml') + self._useBuilder(configfile) + self.waitForImage('fake-provider', 'fake-image') + + nodepool.nodepool.LOCK_CLEANUP = 1 + pool = self.useNodepool(configfile, watermark_sleep=1) + pool.start() + self.wait_for_config(pool) + + client = pool.getProviderManager('fake-provider')._getClient() + + # One of the things we want to test is that if spawn many node + # launches at once, we do not deadlock while the request + # handler pauses for quota. To ensure we test that case, + # pause server creation until we have accepted all of the node + # requests we submit. This will ensure that we hold locks on + # all of the nodes before pausing so that we can validate they + # are released. + client.pause_creates = True + + req1 = zk.NodeRequest() + req1.state = zk.REQUESTED + req1.node_types.append('fake-label') + req1.node_types.append('fake-label') + self.zk.storeNodeRequest(req1) + req2 = zk.NodeRequest() + req2.state = zk.REQUESTED + req2.node_types.append('fake-label') + req2.node_types.append('fake-label') + self.zk.storeNodeRequest(req2) + + req1 = self.waitForNodeRequest(req1, (zk.PENDING,)) + req2 = self.waitForNodeRequest(req2, (zk.PENDING,)) + + # At this point, we should be about to create or have already + # created two servers for the first request, and the request + # handler has accepted the second node request but paused + # waiting for the server count to go below quota. + + # Wait until both of the servers exist. 
+ while len(client._server_list) < 2: + time.sleep(0.1) + + # Allow the servers to finish being created. + for server in client._server_list: + server.event.set() + + self.log.debug("Waiting for 1st request %s", req1.id) + req1 = self.waitForNodeRequest(req1) + self.assertEqual(req1.state, zk.FULFILLED) + self.assertEqual(len(req1.nodes), 2) + + # Mark the first request's nodes as USED, which will get them deleted + # and allow the second to proceed. + self.log.debug("Deleting 1st request %s", req1.id) + for node_id in req1.nodes: + node = self.zk.getNode(node_id) + node.state = zk.USED + self.zk.storeNode(node) + self.zk.deleteNodeRequest(req1) + self.waitForNodeRequestLockDeletion(req1.id) + + # Wait until both of the servers exist. + while len(client._server_list) < 2: + time.sleep(0.1) + + # Allow the servers to finish being created. + for server in client._server_list: + server.event.set() + + req2 = self.waitForNodeRequest(req2) + self.assertEqual(req2.state, zk.FULFILLED) + self.assertEqual(len(req2.nodes), 2) + def test_fail_request_on_launch_failure(self): ''' Test that provider launch error fails the request. From b65f4bb9748d14a84ec429f523b738b02fe723f5 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 10 Mar 2017 14:30:27 -0800 Subject: [PATCH 104/309] Store a pointer to the paused node request handler Rather than looping through, let's say, 500 request handlers looking for the one that is paused, which in the current implementation will almost certainly be the last in the list, just store a pointer to the one which is paused. Change-Id: Ia26345f339297d7d48d93989d3fc7425d6e5e83f --- nodepool/nodepool.py | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index 41ffa321e..d637d7bdd 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -686,12 +686,11 @@ class NodeRequestHandler(object): # If we calculate that we're at capacity, pause until nodes # are released by Zuul and removed by the NodeCleanupWorker. if self._countNodes() >= self.provider.max_servers: - self.paused = True - if not self.pw.paused: + if not self.paused: self.log.debug( "Pausing request handling to satisfy request %s", - self.request) - self.pw.paused = True + self.request) + self.paused = True return if self.paused: @@ -820,6 +819,9 @@ class NodeRequestHandler(object): :returns: True if we are done with the request, False otherwise. 
''' + if self.paused: + return False + if self.done: return True @@ -881,7 +883,7 @@ class ProviderWorker(threading.Thread): self.nodepool = nodepool self.provider_name = provider_name self.running = False - self.paused = False + self.paused_handler = None self.request_handlers = [] self.watermark_sleep = nodepool.watermark_sleep self.zk = self.getZK() @@ -919,7 +921,7 @@ class ProviderWorker(threading.Thread): return for req_id in self.zk.getNodeRequests(): - if self.paused: + if self.paused_handler: return # Short-circuit for limited request handling @@ -955,6 +957,8 @@ class ProviderWorker(threading.Thread): self.log.info("Assigning node request %s" % req) rh = NodeRequestHandler(self, req) rh.run() + if rh.paused: + self.paused_handler = rh self.request_handlers.append(rh) def _removeCompletedHandlers(self): @@ -963,7 +967,7 @@ class ProviderWorker(threading.Thread): ''' active_handlers = [] for r in self.request_handlers: - if r.paused or not r.poll(): + if not r.poll(): active_handlers.append(r) self.request_handlers = active_handlers @@ -996,22 +1000,15 @@ class ProviderWorker(threading.Thread): self.zk.registerLauncher(self.launcher_id) try: - if not self.paused: + if not self.paused_handler: self._assignHandlers() else: - # If we are paused, one request handler could not satisify - # its assigned request, so we need to find it and give it - # another shot (there can be only 1). Unpause ourselves if - # it completed. - completed = True - for handler in self.request_handlers: - if handler.paused: - self.log.debug("Re-run handler %s", handler) - handler.run() - completed = False - break - if completed: - self.paused = False + # If we are paused, one request handler could not + # satisify its assigned request, so give it + # another shot. Unpause ourselves if it completed. + self.paused_handler.run() + if not self.paused_handler.paused: + self.paused_handler = None self._removeCompletedHandlers() except Exception: @@ -1019,9 +1016,8 @@ class ProviderWorker(threading.Thread): time.sleep(self.watermark_sleep) # Cleanup on exit - if self.paused: - for handler in self.request_handlers: - handler.unlockNodeSet(clear_allocation=True) + if self.paused_handler: + self.paused_handler.unlockNodeSet(clear_allocation=True) def stop(self): ''' From 1115a3a8fce31a36832dbcf9354c5e985ef88a3a Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Sun, 12 Mar 2017 07:57:02 -0400 Subject: [PATCH 105/309] Re-enable test_disabled_label Re-enable this test, and correct the docstring since min-ready doesn't affect an image being created, only nodes. 
Change-Id: I4756939db6649edeb4dba567a09e6eb772fb6e9d --- nodepool/tests/test_nodepool.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 19de6693f..d42b27303 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -207,23 +207,15 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(nodes[0].provider, 'fake-provider') self.assertEqual(nodes[0].type, 'fake-label') - - @skip("Disabled for early v3 development") def test_disabled_label(self): - """Test that an image and node are not created""" + """Test that a node is not created with min-ready=0""" configfile = self.setup_config('node_disabled_label.yaml') pool = self.useNodepool(configfile, watermark_sleep=1) self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 0) + self.assertEqual([], self.zk.getNodeRequests()) + self.assertEqual([], self.zk.getNodes()) def test_node_net_name(self): """Test that a node is created with a net name""" From 8fc83ab818e7f6ac682246a7cd002ab08576847c Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Sun, 12 Mar 2017 08:01:10 -0400 Subject: [PATCH 106/309] Re-enable test_node_az Change-Id: I942ce1b0ac14a0cad7c4ec7133e0d34125cec888 --- nodepool/tests/test_nodepool.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index d42b27303..dcf16b0ae 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -274,7 +274,6 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(nodes[1].provider, 'fake-provider2') self.assertEqual(nodes[1].type, 'fake-label') - @skip("Disabled for early v3 development") def test_node_az(self): """Test that an image and node are created with az specified""" configfile = self.setup_config('node_az.yaml') @@ -282,15 +281,10 @@ class TestNodepool(tests.DBTestCase): self._useBuilder(configfile) pool.start() self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) - self.assertEqual(nodes[0].az, 'az1') + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) + self.assertEqual(nodes[0].provider, 'fake-provider') + self.assertEqual(nodes[0].az, 'az1') @skip("Disabled for early v3 development") def test_node_ipv6(self): From b80f03ce2030cc02238c88004eec1ee9d8c9cf42 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Sun, 12 Mar 2017 08:53:25 -0400 Subject: [PATCH 107/309] Fix provider-label association Providers should not build node types for which they are not configured. We were not checking to see if a provider was listed within a label definition before building the node. This caused ANY provider to build a node of that type. 
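
The association check amounts to the following (hypothetical data shapes;
the real code consults the parsed label and provider configuration):

    # A provider may only satisfy node types whose label lists that
    # provider; any other requested type must be declined, not launched.
    def invalid_node_types(requested_types, labels, provider_name):
        invalid = []
        for ntype in requested_types:
            label = labels.get(ntype)
            if label is None or provider_name not in label.get('providers', {}):
                invalid.append(ntype)
        return invalid

    # 'fake-label' is served only by 'fake-provider2', so 'fake-provider'
    # must decline a request for it.
    labels = {'fake-label': {'providers': {'fake-provider2': {}}}}
    assert invalid_node_types(['fake-label'], labels, 'fake-provider') == ['fake-label']
    assert invalid_node_types(['fake-label'], labels, 'fake-provider2') == []
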
Change-Id: I3538b7ced7452c15e2309bc4253e6c13d4c83b84 --- nodepool/nodepool.py | 22 ++++++ .../tests/fixtures/node_label_provider.yaml | 73 +++++++++++++++++++ nodepool/tests/test_nodepool.py | 12 +++ 3 files changed, 107 insertions(+) create mode 100644 nodepool/tests/fixtures/node_label_provider.yaml diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index d637d7bdd..e9d8cc41b 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -600,6 +600,23 @@ class NodeRequestHandler(object): return False return True + def _invalidNodeTypes(self): + ''' + Return any node types that are invalid for this provider. + + :returns: A list of node type names that are invalid, or an empty + list if all are valid. + ''' + invalid = [] + for ntype in self.request.node_types: + if ntype not in self.labels: + invalid.append(ntype) + else: + label = self.labels[ntype] + if self.provider.name not in label.providers.keys(): + invalid.append(ntype) + return invalid + def _countNodes(self): ''' Query ZooKeeper to determine the number of provider nodes launched. @@ -733,6 +750,11 @@ class NodeRequestHandler(object): declined_reasons.append('images are not available') if len(self.request.node_types) > self.provider.max_servers: declined_reasons.append('it would exceed quota') + invalid_types = self._invalidNodeTypes() + if invalid_types: + declined_reasons.append('node type(s) [%s] not available' % + ','.join(invalid_types)) + if declined_reasons: self.log.debug("Declining node request %s because %s", self.request.id, ', '.join(declined_reasons)) diff --git a/nodepool/tests/fixtures/node_label_provider.yaml b/nodepool/tests/fixtures/node_label_provider.yaml new file mode 100644 index 000000000..e9c4c5ee8 --- /dev/null +++ b/nodepool/tests/fixtures/node_label_provider.yaml @@ -0,0 +1,73 @@ +elements-dir: . 
+images-dir: '{images_dir}' + +cron: + check: '*/15 * * * *' + cleanup: '*/1 * * * *' + +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +labels: + - name: fake-label + image: fake-image + min-ready: 1 + providers: + - name: fake-provider2 + +providers: + - name: fake-provider + region-name: fake-region + keypair: 'if-present-use-this-keypair' + username: 'fake' + password: 'fake' + auth-url: 'fake' + project-id: 'fake' + max-servers: 96 + pool: 'fake' + networks: + - net-id: 'some-uuid' + rate: 0.0001 + images: + - name: fake-image + min-ram: 8192 + name-filter: 'Fake' + meta: + key: value + key2: value + - name: fake-provider2 + region-name: fake-region + keypair: 'if-present-use-this-keypair' + username: 'fake' + password: 'fake' + auth-url: 'fake' + project-id: 'fake' + max-servers: 96 + pool: 'fake' + networks: + - net-id: 'some-uuid' + rate: 0.0001 + images: + - name: fake-image + min-ram: 8192 + name-filter: 'Fake' + meta: + key: value + key2: value + +targets: + - name: fake-target + +diskimages: + - name: fake-image + elements: + - fedora + - vm + release: 21 + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index dcf16b0ae..3a56746dc 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -666,3 +666,15 @@ class TestNodepool(tests.DBTestCase): with pool.getDB().getSession() as session: node = session.getNode(2) self.assertEqual(node, None) + + def test_label_provider(self): + """Test that only providers listed in the label satisfy the request""" + configfile = self.setup_config('node_label_provider.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self._useBuilder(configfile) + pool.start() + self.waitForImage('fake-provider', 'fake-image') + self.waitForImage('fake-provider2', 'fake-image') + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) + self.assertEqual(nodes[0].provider, 'fake-provider2') From ee03dda479d899da6699b28a00ab8f2cddf2d149 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Sun, 12 Mar 2017 08:57:57 -0400 Subject: [PATCH 108/309] Re-enable test_node_ipv6 Note: This exposed the provider-label association bug, so it's also a good test for that, too. 
Change-Id: Ia21bc148ad895b7b54bcb9f661928e025e64ed5d --- nodepool/tests/test_nodepool.py | 45 +++++++++++++++------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 3a56746dc..8ed80ac5f 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -286,7 +286,6 @@ class TestNodepool(tests.DBTestCase): self.assertEqual(nodes[0].provider, 'fake-provider') self.assertEqual(nodes[0].az, 'az1') - @skip("Disabled for early v3 development") def test_node_ipv6(self): """Test that a node is created w/ or w/o ipv6 preferred flag""" configfile = self.setup_config('node_ipv6.yaml') @@ -296,30 +295,28 @@ class TestNodepool(tests.DBTestCase): self.waitForImage('fake-provider1', 'fake-image') self.waitForImage('fake-provider2', 'fake-image') self.waitForImage('fake-provider3', 'fake-image') - self.waitForNodes(pool) + label1_nodes = self.waitForNodes('fake-label1') + label2_nodes = self.waitForNodes('fake-label2') + label3_nodes = self.waitForNodes('fake-label3') - with pool.getDB().getSession() as session: - # ipv6 preferred set to true and ipv6 address available - nodes = session.getNodes(provider_name='fake-provider1', - label_name='fake-label1', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) - self.assertEqual(nodes[0].ip, 'fake_v6') - # ipv6 preferred unspecified and ipv6 address available - nodes = session.getNodes(provider_name='fake-provider2', - label_name='fake-label2', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) - self.assertEqual(nodes[0].ip, 'fake') - # ipv6 preferred set to true but ipv6 address unavailable - nodes = session.getNodes(provider_name='fake-provider3', - label_name='fake-label3', - target_name='fake-target', - state=nodedb.READY) - self.assertEqual(len(nodes), 1) - self.assertEqual(nodes[0].ip, 'fake') + self.assertEqual(len(label1_nodes), 1) + self.assertEqual(len(label2_nodes), 1) + self.assertEqual(len(label3_nodes), 1) + + # ipv6 preferred set to true and ipv6 address available + self.assertEqual(label1_nodes[0].provider, 'fake-provider1') + self.assertEqual(label1_nodes[0].public_ipv4, 'fake') + self.assertEqual(label1_nodes[0].public_ipv6, 'fake_v6') + + # ipv6 preferred unspecified and ipv6 address available + self.assertEqual(label2_nodes[0].provider, 'fake-provider2') + self.assertEqual(label2_nodes[0].public_ipv4, 'fake') + self.assertEqual(label2_nodes[0].public_ipv6, 'fake_v6') + + # ipv6 preferred set to true but ipv6 address unavailable + self.assertEqual(label3_nodes[0].provider, 'fake-provider3') + self.assertEqual(label3_nodes[0].public_ipv4, 'fake') + self.assertEqual(label3_nodes[0].public_ipv6, '') def test_node_delete_success(self): configfile = self.setup_config('node.yaml') From 28405cf1fa54b9574b3b64e2e3ea720d8b4b3ad8 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Sun, 12 Mar 2017 09:01:39 -0400 Subject: [PATCH 109/309] Remove test_nodepool.test_job_* tests These are invalid in the ZuulV3 world. 
Change-Id: Ib66ad07005463b06e29f552b454161fc79bf8577 --- nodepool/tests/test_nodepool.py | 125 -------------------------------- 1 file changed, 125 deletions(-) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 8ed80ac5f..c427fa060 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import logging import time from unittest import skip @@ -540,130 +539,6 @@ class TestNodepool(tests.DBTestCase): # should be second image built. self.assertEqual(images[0].id, 2) - @skip("Disabled for early v3 development") - def test_job_start_event(self): - """Test that job start marks node used""" - configfile = self.setup_config('node.yaml') - pool = self.useNodepool(configfile, watermark_sleep=1) - self._useBuilder(configfile) - pool.start() - self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - - msg_obj = {'name': 'fake-job', - 'build': {'node_name': 'fake-label-fake-provider-1'}} - json_string = json.dumps(msg_obj) - handler = nodepool.nodepool.NodeUpdateListener(pool, - 'tcp://localhost:8881') - handler.handleEvent('onStarted', json_string) - self.wait_for_threads() - - with pool.getDB().getSession() as session: - nodes = session.getNodes(provider_name='fake-provider', - label_name='fake-label', - target_name='fake-target', - state=nodedb.USED) - self.assertEqual(len(nodes), 1) - - @skip("Disabled for early v3 development") - def test_job_end_event(self): - """Test that job end marks node delete""" - configfile = self.setup_config('node.yaml') - pool = self.useNodepool(configfile, watermark_sleep=1) - self._useBuilder(configfile) - pool.start() - self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - - msg_obj = {'name': 'fake-job', - 'build': {'node_name': 'fake-label-fake-provider-1', - 'status': 'SUCCESS'}} - json_string = json.dumps(msg_obj) - # Don't delay when deleting. - self.useFixture(fixtures.MonkeyPatch( - 'nodepool.nodepool.DELETE_DELAY', - 0)) - handler = nodepool.nodepool.NodeUpdateListener(pool, - 'tcp://localhost:8881') - handler.handleEvent('onFinalized', json_string) - self.wait_for_threads() - - with pool.getDB().getSession() as session: - node = session.getNode(1) - self.assertEqual(node, None) - - @skip("Disabled for early v3 development") - def _test_job_auto_hold(self, result): - configfile = self.setup_config('node.yaml') - pool = self.useNodepool(configfile, watermark_sleep=1) - self._useBuilder(configfile) - pool.start() - - self.waitForImage('fake-provider', 'fake-image') - self.waitForNodes(pool) - - with pool.getDB().getSession() as session: - session.createJob('fake-job', hold_on_failure=1) - - msg_obj = {'name': 'fake-job', - 'build': {'node_name': 'fake-label-fake-provider-1', - 'status': result}} - json_string = json.dumps(msg_obj) - # Don't delay when deleting. 
- self.useFixture(fixtures.MonkeyPatch( - 'nodepool.nodepool.DELETE_DELAY', - 0)) - handler = nodepool.nodepool.NodeUpdateListener(pool, - 'tcp://localhost:8881') - handler.handleEvent('onFinalized', json_string) - self.wait_for_threads() - return pool - - @skip("Disabled for early v3 development") - def test_job_auto_hold_success(self): - """Test that a successful job does not hold a node""" - pool = self._test_job_auto_hold('SUCCESS') - with pool.getDB().getSession() as session: - node = session.getNode(1) - self.assertIsNone(node) - - @skip("Disabled for early v3 development") - def test_job_auto_hold_failure(self): - """Test that a failed job automatically holds a node""" - pool = self._test_job_auto_hold('FAILURE') - with pool.getDB().getSession() as session: - node = session.getNode(1) - self.assertEqual(node.state, nodedb.HOLD) - - @skip("Disabled for early v3 development") - def test_job_auto_hold_failure_max(self): - """Test that a failed job automatically holds only one node""" - pool = self._test_job_auto_hold('FAILURE') - with pool.getDB().getSession() as session: - node = session.getNode(1) - self.assertEqual(node.state, nodedb.HOLD) - - # Wait for a replacement node - self.waitForNodes(pool) - with pool.getDB().getSession() as session: - node = session.getNode(2) - self.assertEqual(node.state, nodedb.READY) - - # Fail the job again - msg_obj = {'name': 'fake-job', - 'build': {'node_name': 'fake-label-fake-provider-2', - 'status': 'FAILURE'}} - json_string = json.dumps(msg_obj) - handler = nodepool.nodepool.NodeUpdateListener(pool, - 'tcp://localhost:8881') - handler.handleEvent('onFinalized', json_string) - self.wait_for_threads() - - # Ensure that the second node was deleted - with pool.getDB().getSession() as session: - node = session.getNode(2) - self.assertEqual(node, None) - def test_label_provider(self): """Test that only providers listed in the label satisfy the request""" configfile = self.setup_config('node_label_provider.yaml') From 8c6461ebe439ea5178fa3cac680bccc96f00deac Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Sun, 12 Mar 2017 10:21:03 -0400 Subject: [PATCH 110/309] Remove Jenkins Remove files and fakes related to Jenkins. Since the 'targets' config section was for mapping to Jenkins, this isn't needed either. 
Change-Id: Ib5c615a95fcdce5234b3c63957171d77b8fbc65d --- devstack/plugin.sh | 5 - doc/source/configuration.rst | 84 +---------- nodepool/cmd/config_validator.py | 12 -- nodepool/config.py | 33 ----- nodepool/fakeprovider.py | 46 ------ nodepool/jenkins_manager.py | 137 ------------------ nodepool/myjenkins.py | 136 ----------------- .../tests/fixtures/config_validate/good.yaml | 3 - .../fixtures/config_validate/yaml_error.yaml | 3 - nodepool/tests/fixtures/integration.yaml | 7 - nodepool/tests/fixtures/integration_occ.yaml | 3 - nodepool/tests/fixtures/leaked_node.yaml | 3 - nodepool/tests/fixtures/node.yaml | 3 - nodepool/tests/fixtures/node_az.yaml | 3 - nodepool/tests/fixtures/node_cmd.yaml | 3 - .../tests/fixtures/node_disabled_label.yaml | 3 - .../tests/fixtures/node_diskimage_fail.yaml | 3 - .../tests/fixtures/node_diskimage_only.yaml | 2 - .../tests/fixtures/node_diskimage_pause.yaml | 3 - .../fixtures/node_image_upload_pause.yaml | 3 - nodepool/tests/fixtures/node_ipv6.yaml | 3 - .../tests/fixtures/node_label_provider.yaml | 3 - .../tests/fixtures/node_launch_retry.yaml | 3 - nodepool/tests/fixtures/node_net_name.yaml | 3 - nodepool/tests/fixtures/node_quota.yaml | 3 - nodepool/tests/fixtures/node_two_image.yaml | 3 - .../tests/fixtures/node_two_image_remove.yaml | 3 - .../tests/fixtures/node_two_provider.yaml | 3 - .../fixtures/node_two_provider_remove.yaml | 3 - nodepool/tests/fixtures/node_upload_fail.yaml | 3 - nodepool/tests/fixtures/node_vhd.yaml | 3 - .../tests/fixtures/node_vhd_and_qcow2.yaml | 3 - nodepool/tests/fixtures/secure.conf | 6 - requirements.txt | 1 - tools/fake-dib.yaml | 3 - tools/fake-secure.conf | 6 - tools/fake.yaml | 3 - 37 files changed, 5 insertions(+), 545 deletions(-) delete mode 100644 nodepool/jenkins_manager.py delete mode 100644 nodepool/myjenkins.py diff --git a/devstack/plugin.sh b/devstack/plugin.sh index d2f5528c6..7149208e0 100644 --- a/devstack/plugin.sh +++ b/devstack/plugin.sh @@ -184,11 +184,6 @@ zookeeper-servers: - host: localhost port: 2181 -# Need to have at least one target for node allocations, but -# this does not need to be a jenkins target. -targets: - - name: dummy - cron: cleanup: '*/1 * * * *' check: '*/15 * * * *' diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index 82a5b0016..e41d4a80f 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -5,18 +5,11 @@ Configuration Nodepool reads its secure configuration from ``/etc/nodepool/secure.conf`` by default. The secure file is a standard ini config file, with -one section for database, and another section for the jenkins -secrets for each target:: +one section for the database. [database] dburi={dburi} - [jenkins "{target_name}"] - user={user} - apikey={apikey} - credentials={credentials} - url={url} - Following settings are available:: **required** @@ -29,36 +22,11 @@ Following settings are available:: dburi='mysql+pymysql://nodepool@localhost/nodepool' -**optional** - - While it is possible to run Nodepool without any Jenkins targets, - if Jenkins is used, the `target_name` and `url` are required. The - `user`, `apikey` and `credentials` also may be needed depending on - the Jenkins security settings. - - ``target_name`` - Name of the jenkins target. It needs to match with a target - specified in nodepool.yaml, in order to retrieve its settings. - - ``url`` - Url to the Jenkins REST API. - - ``user`` - Jenkins username. - - ``apikey`` - API key generated by Jenkins (not the user password). 
- - ``credentials`` - If provided, Nodepool will configure the Jenkins slave to use the Jenkins - credential identified by that ID, otherwise it will use the username and - ssh keys configured in the image. - Nodepool reads its configuration from ``/etc/nodepool/nodepool.yaml`` by default. The configuration file follows the standard YAML syntax with a number of sections defined with top level keys. For example, a full configuration file may have the ``diskimages``, ``labels``, -``providers``, and ``targets`` sections:: +and ``providers`` sections:: diskimages: ... @@ -66,8 +34,6 @@ full configuration file may have the ``diskimages``, ``labels``, ... providers: ... - targets: - ... The following sections are available. All are required unless otherwise indicated. @@ -135,8 +101,7 @@ labels Defines the types of nodes that should be created. Maps node types to the images that are used to back them and the providers that are used to supply them. Jobs should be written to run on nodes of a certain -label (so targets such as Jenkins don't need to know about what -providers or images are used to create them). Example:: +label. Example:: labels: - name: my-precise @@ -430,9 +395,8 @@ provider, the Nodepool image types are also defined (see ``ipv6-preferred`` If it is set to True, nodepool will try to find ipv6 in public net first - as the ip address for ssh connection to build snapshot images and create - jenkins slave definition. If ipv6 is not found or the key is not - specified or set to False, ipv4 address will be used. + as the ip address for the ssh connection. If ipv6 is not found or the key + is not specified or set to False, ipv4 address will be used. ``api-timeout`` (compatability) Timeout for the OpenStack API calls client in seconds. Prefer setting @@ -533,41 +497,3 @@ Example configuration:: Arbitrary key/value metadata to store for this server using the Nova metadata service. A maximum of five entries is allowed, and both keys and values must be 255 characters or less. - -.. _targets: - -targets -------- - -Lists the Jenkins masters to which Nodepool should attach nodes after -they are created. Nodes of each label will be evenly distributed -across all of the targets which are on-line:: - - targets: - - name: jenkins1 - - name: jenkins2 - -**required** - - ``name`` - Identifier for the system an instance is attached to. - -**optional** - - ``rate`` - In seconds. Default 1.0 - - ``jenkins`` (dict) - - ``test-job`` (optional) - Setting this would cause a newly created instance to be in a TEST state. - The job name given will then be executed with the node name as a - parameter. - - If the job succeeds, move the node into READY state and relabel it with - the appropriate label (from the image name). - - If it fails, immediately delete the node. - - If the job never runs, the node will eventually be cleaned up by the - periodic cleanup task. 
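Dropping the 'targets' documentation goes hand in hand with dropping the key from the voluptuous schema in nodepool/cmd/config_validator.py (the hunk that follows): voluptuous rejects keys that are not declared in a schema by default, so a nodepool.yaml that still carries a targets section should now fail 'nodepool config-validate' rather than being silently accepted. Below is a minimal, illustrative sketch of that behaviour. It uses a stand-in schema that lists only a few of the real top-level keys, not the actual schema from config_validator.py, and it assumes the real schema keeps voluptuous's default of rejecting unknown keys; treat the exact error text as indicative.

import voluptuous as v

# Stand-in for the trimmed top-level schema; the real one in
# config_validator.py has richer per-section validators and more keys
# (e.g. 'cron'). This is illustrative only.
top_level = v.Schema({
    'labels': list,
    'providers': list,
    'diskimages': list,
    # 'targets' used to be declared here and is now gone.
})

# Still accepted: only the remaining sections are present.
top_level({'labels': [], 'providers': [], 'diskimages': []})

# Rejected: voluptuous treats undeclared keys as errors by default.
try:
    top_level({'labels': [], 'targets': [{'name': 'jenkins1'}]})
except v.MultipleInvalid as e:
    print(e)  # e.g. "extra keys not allowed @ data['targets']"

The practical consequence for operators migrating an existing deployment is that a leftover 'targets' block needs to be removed from nodepool.yaml, not merely left unused.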
diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index f49ffe3d2..f8b463762 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -92,17 +92,6 @@ class ConfigValidator: }], } - targets = { - 'name': str, - 'jenkins': { - 'url': str, - 'user': str, - 'apikey': str, - 'credentials-id': str, - 'test-job': str - } - } - diskimages = { 'name': str, 'pause': bool, @@ -125,7 +114,6 @@ class ConfigValidator: 'cron': cron, 'providers': [providers], 'labels': [labels], - 'targets': [targets], 'diskimages': [diskimages], } diff --git a/nodepool/config.py b/nodepool/config.py index 0f76c551b..aa77cc11e 100644 --- a/nodepool/config.py +++ b/nodepool/config.py @@ -82,11 +82,6 @@ class ProviderImage(ConfigValue): return "" % self.name -class Target(ConfigValue): - def __repr__(self): - return "" % self.name - - class Label(ConfigValue): def __repr__(self): return "