From 6da857c0ae60c1e8d2558c9fcb2e23cc82c43cfc Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 22 Jun 2016 12:12:15 -0700 Subject: [PATCH] Add auto-hold feature This adds a new table and series of commands to manipulate it in which an operator may indicate that nodes which have run failed instances of specified jobs should automatically be held. Change-Id: I69b00fbdeed4fba086a54f051bbb51384ea26a70 --- doc/source/operation.rst | 22 +++++++++++ nodepool/cmd/nodepoolcmd.py | 39 +++++++++++++++++++ nodepool/nodedb.py | 44 ++++++++++++++++++++++ nodepool/nodepool.py | 18 +++++++++ nodepool/tests/test_commands.py | 17 +++++++++ nodepool/tests/test_nodepool.py | 67 +++++++++++++++++++++++++++++++++ 6 files changed, 207 insertions(+) diff --git a/doc/source/operation.rst b/doc/source/operation.rst index 41a6a1e10..1c780f175 100644 --- a/doc/source/operation.rst +++ b/doc/source/operation.rst @@ -147,6 +147,28 @@ alien-image-list .. program-output:: nodepool alien-image-list --help :nostderr: +In the case that a job is randomly failing for an unknown cause, it +may be necessary to instruct nodepool to automatically hold a node on +which that job has failed. To do so, use the ``job-create`` +command to specify the job name and how many failed nodes should be +held. When debugging is complete, use ``job-delete`` to disable the +feature. + +job-create +^^^^^^^^^^ +.. program-output:: nodepool job-create --help + :nostderr: + +job-list +^^^^^^^^ +.. program-output:: nodepool job-list --help + :nostderr: + +job-delete +^^^^^^^^^^ +.. 
program-output:: nodepool job-delete --help + :nostderr: + Removing a Provider =================== diff --git a/nodepool/cmd/nodepoolcmd.py b/nodepool/cmd/nodepoolcmd.py index 57f79fba7..79e339d7a 100644 --- a/nodepool/cmd/nodepoolcmd.py +++ b/nodepool/cmd/nodepoolcmd.py @@ -144,6 +144,23 @@ class NodePoolCmd(object): help='Validate configuration file') cmd_config_validate.set_defaults(func=self.config_validate) + cmd_job_list = subparsers.add_parser('job-list', help='list jobs') + cmd_job_list.set_defaults(func=self.job_list) + + cmd_job_create = subparsers.add_parser('job-create', help='create job') + cmd_job_create.add_argument( + 'name', + help='job name') + cmd_job_create.add_argument('--hold-on-failure', + help='number of nodes to hold when this job fails') + cmd_job_create.set_defaults(func=self.job_create) + + cmd_job_delete = subparsers.add_parser( + 'job-delete', + help='delete job') + cmd_job_delete.set_defaults(func=self.job_delete) + cmd_job_delete.add_argument('id', help='job id') + self.args = parser.parse_args() def setup_logging(self): @@ -374,6 +391,28 @@ class NodePoolCmd(object): log.info("Configuation validation complete") #TODO(asselin,yolanda): add validation of secure.conf + def job_list(self): + t = PrettyTable(["ID", "Name", "Hold on Failure"]) + t.align = 'l' + with self.pool.getDB().getSession() as session: + for job in session.getJobs(): + t.add_row([job.id, job.name, job.hold_on_failure]) + print t + + def job_create(self): + with self.pool.getDB().getSession() as session: + session.createJob(self.args.name, + hold_on_failure=self.args.hold_on_failure) + self.job_list() + + def job_delete(self): + with self.pool.getDB().getSession() as session: + job = session.getJob(self.args.id) + if not job: + print "Job %s not found." 
% self.args.id + else: + job.delete() + def _wait_for_threads(self, threads): for t in threads: if t: diff --git a/nodepool/nodedb.py b/nodepool/nodedb.py index 95265ee7b..ff3d671a4 100644 --- a/nodepool/nodedb.py +++ b/nodepool/nodedb.py @@ -126,6 +126,15 @@ subnode_table = Table( Column('state_time', Integer), mysql_engine='InnoDB', ) +job_table = Table( + 'job', metadata, + Column('id', Integer, primary_key=True), + # The name of the job + Column('name', String(255), index=True), + # Automatically hold up to this number of nodes that fail this job + Column('hold_on_failure', Integer), + mysql_engine='InnoDB', + ) class DibImage(object): @@ -249,6 +258,20 @@ class SubNode(object): session.commit() +class Job(object): + def __init__(self, name=None, hold_on_failure=0): + self.name = name + self.hold_on_failure = hold_on_failure + + def delete(self): + session = Session.object_session(self) + session.delete(self) + session.commit() + + +mapper(Job, job_table) + + mapper(SubNode, subnode_table, properties=dict(_state=subnode_table.c.state)) @@ -460,3 +483,24 @@ class NodeDatabaseSession(object): if not nodes: return None return nodes[0] + + def getJob(self, id): + jobs = self.session().query(Job).filter_by(id=id).all() + if not jobs: + return None + return jobs[0] + + def getJobByName(self, name): + jobs = self.session().query(Job).filter_by(name=name).all() + if not jobs: + return None + return jobs[0] + + def getJobs(self): + return self.session().query(Job).all() + + def createJob(self, *args, **kwargs): + new = Job(*args, **kwargs) + self.session().add(new) + self.commit() + return new diff --git a/nodepool/nodepool.py b/nodepool/nodepool.py index af7a13722..80dd75aff 100644 --- a/nodepool/nodepool.py +++ b/nodepool/nodepool.py @@ -107,6 +107,24 @@ class NodeCompleteThread(threading.Thread): node.id) return + nodepool_job = session.getJobByName(self.jobname) + if (nodepool_job and nodepool_job.hold_on_failure and + self.result != 'SUCCESS'): + held_nodes = 
session.getNodes(state=nodedb.HOLD) + held_nodes = [n for n in held_nodes if self.jobname in n.comment] + if len(held_nodes) >= nodepool_job.hold_on_failure: + self.log.info("Node id: %s has failed %s but %s nodes " + "are already held for that job" % ( + node.id, self.jobname, len(held_nodes))) + else: + node.state = nodedb.HOLD + node.comment = "Automatically held after failing %s" % ( + self.jobname,) + self.log.info("Node id: %s failed %s, automatically holding" % ( + node.id, self.jobname)) + self.nodepool.updateStats(session, node.provider_name) + return + target = self.nodepool.config.targets[node.target_name] if self.jobname == target.jenkins_test_job: self.log.debug("Test job for node id: %s complete, result: %s" % diff --git a/nodepool/tests/test_commands.py b/nodepool/tests/test_commands.py index b50b25530..2e832c66c 100644 --- a/nodepool/tests/test_commands.py +++ b/nodepool/tests/test_commands.py @@ -257,3 +257,20 @@ class TestNodepoolCMD(tests.DBTestCase): self.patch_argv("-c", configfile, "image-build", "fake-dib-diskimage") nodepoolcmd.main() self.assert_listed(configfile, ['dib-image-list'], 4, 'ready', 1) + + def test_job_create(self): + configfile = self.setup_config('node.yaml') + self.patch_argv("-c", configfile, "job-create", "fake-job", + "--hold-on-failure", "1") + nodepoolcmd.main() + self.assert_listed(configfile, ['job-list'], 2, 1, 1) + + def test_job_delete(self): + configfile = self.setup_config('node.yaml') + self.patch_argv("-c", configfile, "job-create", "fake-job", + "--hold-on-failure", "1") + nodepoolcmd.main() + self.assert_listed(configfile, ['job-list'], 2, 1, 1) + self.patch_argv("-c", configfile, "job-delete", "1") + nodepoolcmd.main() + self.assert_listed(configfile, ['job-list'], 0, 1, 0) diff --git a/nodepool/tests/test_nodepool.py b/nodepool/tests/test_nodepool.py index 755f4fdb1..34ede0460 100644 --- a/nodepool/tests/test_nodepool.py +++ b/nodepool/tests/test_nodepool.py @@ -625,6 +625,73 @@ class 
TestNodepool(tests.DBTestCase): node = session.getNode(1) self.assertEqual(node, None) + def _test_job_auto_hold(self, result): + configfile = self.setup_config('node.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + pool.start() + + self.waitForImage(pool, 'fake-provider', 'fake-image') + self.waitForNodes(pool) + + with pool.getDB().getSession() as session: + session.createJob('fake-job', hold_on_failure=1) + + msg_obj = {'name': 'fake-job', + 'build': {'node_name': 'fake-label-fake-provider-1', + 'status': result}} + json_string = json.dumps(msg_obj) + # Don't delay when deleting. + self.useFixture(fixtures.MonkeyPatch( + 'nodepool.nodepool.DELETE_DELAY', + 0)) + handler = nodepool.nodepool.NodeUpdateListener(pool, + 'tcp://localhost:8881') + handler.handleEvent('onFinalized', json_string) + self.wait_for_threads() + return pool + + def test_job_auto_hold_success(self): + """Test that a successful job does not hold a node""" + pool = self._test_job_auto_hold('SUCCESS') + with pool.getDB().getSession() as session: + node = session.getNode(1) + self.assertIsNone(node) + + def test_job_auto_hold_failure(self): + """Test that a failed job automatically holds a node""" + pool = self._test_job_auto_hold('FAILURE') + with pool.getDB().getSession() as session: + node = session.getNode(1) + self.assertEqual(node.state, nodedb.HOLD) + + def test_job_auto_hold_failure_max(self): + """Test that a failed job automatically holds only one node""" + pool = self._test_job_auto_hold('FAILURE') + with pool.getDB().getSession() as session: + node = session.getNode(1) + self.assertEqual(node.state, nodedb.HOLD) + + # Wait for a replacement node + self.waitForNodes(pool) + with pool.getDB().getSession() as session: + node = session.getNode(2) + self.assertEqual(node.state, nodedb.READY) + + # Fail the job again + msg_obj = {'name': 'fake-job', + 'build': {'node_name': 'fake-label-fake-provider-2', + 'status': 'FAILURE'}} + json_string = json.dumps(msg_obj) + handler = 
nodepool.nodepool.NodeUpdateListener(pool, + 'tcp://localhost:8881') + handler.handleEvent('onFinalized', json_string) + self.wait_for_threads() + + # Ensure that the second node was deleted + with pool.getDB().getSession() as session: + node = session.getNode(2) + self.assertEqual(node, None) + class TestGearClient(tests.DBTestCase): def test_wait_for_completion(self):