Add auto-hold feature
This adds a new table and series of commands to manipulate it in which an operator may indicate that nodes which have run failed instances of specified jobs should automatically be held. Change-Id: I69b00fbdeed4fba086a54f051bbb51384ea26a70
This commit is contained in:
parent
32992cd86d
commit
6da857c0ae
@ -147,6 +147,28 @@ alien-image-list
|
||||
.. program-output:: nodepool alien-image-list --help
|
||||
:nostderr:
|
||||
|
||||
In the case that a job is randomly failing for an unknown cause, it
|
||||
may be necessary to instruct nodepool to automatically hold a node on
|
||||
which that job has failed. To do so, use the ``job-create``
|
||||
command to specify the job name and how many failed nodes should be
|
||||
held. When debugging is complete, use ``job-delete`` to disable the
|
||||
feature.
|
||||
|
||||
job-create
|
||||
^^^^^^^^^^
|
||||
.. program-output:: nodepool job-create --help
|
||||
:nostderr:
|
||||
|
||||
job-list
|
||||
^^^^^^^^
|
||||
.. program-output:: nodepool job-list --help
|
||||
:nostderr:
|
||||
|
||||
job-delete
|
||||
^^^^^^^^^^
|
||||
.. program-output:: nodepool job-delete --help
|
||||
:nostderr:
|
||||
|
||||
Removing a Provider
|
||||
===================
|
||||
|
||||
|
@ -144,6 +144,23 @@ class NodePoolCmd(object):
|
||||
help='Validate configuration file')
|
||||
cmd_config_validate.set_defaults(func=self.config_validate)
|
||||
|
||||
# Sub-commands for managing the auto-hold "job" table.
cmd_job_list = subparsers.add_parser('job-list', help='list jobs')
cmd_job_list.set_defaults(func=self.job_list)

cmd_job_create = subparsers.add_parser('job-create', help='create job')
cmd_job_create.add_argument(
    'name',
    help='job name')
# type=int so the count is an integer before it reaches the DB and the
# hold-cap comparison; without it argparse hands back a string.
cmd_job_create.add_argument('--hold-on-failure',
                            type=int,
                            help='number of nodes to hold when this job fails')
cmd_job_create.set_defaults(func=self.job_create)

cmd_job_delete = subparsers.add_parser(
    'job-delete',
    help='delete job')
cmd_job_delete.set_defaults(func=self.job_delete)
cmd_job_delete.add_argument('id', help='job id')
||||
self.args = parser.parse_args()
|
||||
|
||||
def setup_logging(self):
|
||||
@ -374,6 +391,28 @@ class NodePoolCmd(object):
|
||||
log.info("Configuation validation complete")
|
||||
#TODO(asselin,yolanda): add validation of secure.conf
|
||||
|
||||
def job_list(self):
|
||||
t = PrettyTable(["ID", "Name", "Hold on Failure"])
|
||||
t.align = 'l'
|
||||
with self.pool.getDB().getSession() as session:
|
||||
for job in session.getJobs():
|
||||
t.add_row([job.id, job.name, job.hold_on_failure])
|
||||
print t
|
||||
|
||||
def job_create(self):
    """Create an auto-hold job entry from CLI args, then list jobs."""
    db = self.pool.getDB()
    with db.getSession() as session:
        session.createJob(
            self.args.name,
            hold_on_failure=self.args.hold_on_failure)
    self.job_list()
|
||||
|
||||
def job_delete(self):
|
||||
with self.pool.getDB().getSession() as session:
|
||||
job = session.getJob(self.args.id)
|
||||
if not job:
|
||||
print "Job %s not found." % self.args.id
|
||||
else:
|
||||
job.delete()
|
||||
|
||||
def _wait_for_threads(self, threads):
|
||||
for t in threads:
|
||||
if t:
|
||||
|
@ -126,6 +126,15 @@ subnode_table = Table(
|
||||
Column('state_time', Integer),
|
||||
mysql_engine='InnoDB',
|
||||
)
|
||||
# Table backing the auto-hold feature: one row per job name that an
# operator has registered via the job-create command.
job_table = Table(
    'job', metadata,
    Column('id', Integer, primary_key=True),
    # The name of the job
    Column('name', String(255), index=True),
    # Automatically hold up to this number of nodes that fail this job
    Column('hold_on_failure', Integer),
    mysql_engine='InnoDB',
    )
|
||||
|
||||
|
||||
class DibImage(object):
|
||||
@ -249,6 +258,20 @@ class SubNode(object):
|
||||
session.commit()
|
||||
|
||||
|
||||
class Job(object):
    """ORM-mapped record of a job whose failures trigger node auto-hold."""

    def __init__(self, name=None, hold_on_failure=0):
        # Jenkins job name to watch.
        self.name = name
        # Maximum number of failed nodes to keep held for this job.
        self.hold_on_failure = hold_on_failure

    def delete(self):
        """Remove this row using the session that owns the instance."""
        owning_session = Session.object_session(self)
        owning_session.delete(self)
        owning_session.commit()
|
||||
|
||||
|
||||
mapper(Job, job_table)
|
||||
|
||||
|
||||
mapper(SubNode, subnode_table,
|
||||
properties=dict(_state=subnode_table.c.state))
|
||||
|
||||
@ -460,3 +483,24 @@ class NodeDatabaseSession(object):
|
||||
if not nodes:
|
||||
return None
|
||||
return nodes[0]
|
||||
|
||||
def getJob(self, id):
    """Return the job with the given primary key, or None."""
    matches = self.session().query(Job).filter_by(id=id).all()
    return matches[0] if matches else None
|
||||
|
||||
def getJobByName(self, name):
    """Return the first job with the given name, or None."""
    matches = self.session().query(Job).filter_by(name=name).all()
    return matches[0] if matches else None
|
||||
|
||||
def getJobs(self):
    """Return all auto-hold job records."""
    query = self.session().query(Job)
    return query.all()
|
||||
|
||||
def createJob(self, *args, **kwargs):
    """Create, persist, and return a new Job record."""
    job = Job(*args, **kwargs)
    self.session().add(job)
    self.commit()
    return job
|
||||
|
@ -107,6 +107,24 @@ class NodeCompleteThread(threading.Thread):
|
||||
node.id)
|
||||
return
|
||||
|
||||
# Auto-hold: if an operator registered this job with a hold-on-failure
# count and the build did not succeed, hold the node instead of letting
# it be recycled.
nodepool_job = session.getJobByName(self.jobname)
if (nodepool_job and nodepool_job.hold_on_failure and
        self.result != 'SUCCESS'):
    held_nodes = session.getNodes(state=nodedb.HOLD)
    # NOTE(review): substring match — a job name that is a prefix of
    # another job's name could be over-counted here; confirm intended.
    held_nodes = [n for n in held_nodes if self.jobname in n.comment]
    if len(held_nodes) >= nodepool_job.hold_on_failure:
        # Cap reached: just log and fall through to normal handling.
        self.log.info("Node id: %s has failed %s but %s nodes "
                      "are already held for that job" % (
                          node.id, self.jobname, len(held_nodes)))
    else:
        # Hold the node and record why, so operators can find it later.
        node.state = nodedb.HOLD
        node.comment = "Automatically held after failing %s" % (
            self.jobname,)
        self.log.info("Node id: %s failed %s, automatically holding" % (
            node.id, self.jobname))
        self.nodepool.updateStats(session, node.provider_name)
        return
|
||||
|
||||
target = self.nodepool.config.targets[node.target_name]
|
||||
if self.jobname == target.jenkins_test_job:
|
||||
self.log.debug("Test job for node id: %s complete, result: %s" %
|
||||
|
@ -257,3 +257,20 @@ class TestNodepoolCMD(tests.DBTestCase):
|
||||
self.patch_argv("-c", configfile, "image-build", "fake-dib-diskimage")
|
||||
nodepoolcmd.main()
|
||||
self.assert_listed(configfile, ['dib-image-list'], 4, 'ready', 1)
|
||||
|
||||
def test_job_create(self):
    """Creating a job via the CLI makes it appear in job-list."""
    configfile = self.setup_config('node.yaml')
    argv = ("-c", configfile, "job-create", "fake-job",
            "--hold-on-failure", "1")
    self.patch_argv(*argv)
    nodepoolcmd.main()
    # Column 2 (hold-on-failure) shows 1, and exactly 1 row is listed.
    self.assert_listed(configfile, ['job-list'], 2, 1, 1)
|
||||
|
||||
def test_job_delete(self):
    """Deleting a job via the CLI removes it from job-list."""
    configfile = self.setup_config('node.yaml')
    # Create a job first so there is something to delete.
    self.patch_argv("-c", configfile, "job-create", "fake-job",
                    "--hold-on-failure", "1")
    nodepoolcmd.main()
    self.assert_listed(configfile, ['job-list'], 2, 1, 1)
    # Delete it and verify the listing no longer contains it.
    self.patch_argv("-c", configfile, "job-delete", "1")
    nodepoolcmd.main()
    self.assert_listed(configfile, ['job-list'], 0, 1, 0)
|
||||
|
@ -625,6 +625,73 @@ class TestNodepool(tests.DBTestCase):
|
||||
node = session.getNode(1)
|
||||
self.assertEqual(node, None)
|
||||
|
||||
def _test_job_auto_hold(self, result):
    """Start a pool, register 'fake-job' with hold_on_failure=1, and
    simulate a build completion for node 1 with the given result.

    Returns the running pool so callers can inspect node state.
    """
    configfile = self.setup_config('node.yaml')
    pool = self.useNodepool(configfile, watermark_sleep=1)
    pool.start()

    self.waitForImage(pool, 'fake-provider', 'fake-image')
    self.waitForNodes(pool)

    with pool.getDB().getSession() as session:
        session.createJob('fake-job', hold_on_failure=1)

    # Build the event payload Jenkins would emit when the build ends.
    msg_obj = {'name': 'fake-job',
               'build': {'node_name': 'fake-label-fake-provider-1',
                         'status': result}}
    json_string = json.dumps(msg_obj)
    # Don't delay when deleting.
    self.useFixture(fixtures.MonkeyPatch(
        'nodepool.nodepool.DELETE_DELAY',
        0))
    # Feed the event straight to the listener rather than over ZMQ.
    handler = nodepool.nodepool.NodeUpdateListener(pool,
                                                   'tcp://localhost:8881')
    handler.handleEvent('onFinalized', json_string)
    self.wait_for_threads()
    return pool
|
||||
|
||||
def test_job_auto_hold_success(self):
    """Test that a successful job does not hold a node"""
    pool = self._test_job_auto_hold('SUCCESS')
    with pool.getDB().getSession() as session:
        # A successful build means the node is deleted, not held.
        self.assertIsNone(session.getNode(1))
|
||||
|
||||
def test_job_auto_hold_failure(self):
    """Test that a failed job automatically holds a node"""
    pool = self._test_job_auto_hold('FAILURE')
    with pool.getDB().getSession() as session:
        held = session.getNode(1)
        self.assertEqual(held.state, nodedb.HOLD)
|
||||
|
||||
def test_job_auto_hold_failure_max(self):
    """Test that a failed job automatically holds only one node"""
    pool = self._test_job_auto_hold('FAILURE')
    with pool.getDB().getSession() as session:
        # First failure: node 1 is auto-held (hold_on_failure=1).
        node = session.getNode(1)
        self.assertEqual(node.state, nodedb.HOLD)

    # Wait for a replacement node
    self.waitForNodes(pool)
    with pool.getDB().getSession() as session:
        node = session.getNode(2)
        self.assertEqual(node.state, nodedb.READY)

    # Fail the job again
    msg_obj = {'name': 'fake-job',
               'build': {'node_name': 'fake-label-fake-provider-2',
                         'status': 'FAILURE'}}
    json_string = json.dumps(msg_obj)
    handler = nodepool.nodepool.NodeUpdateListener(pool,
                                                   'tcp://localhost:8881')
    handler.handleEvent('onFinalized', json_string)
    self.wait_for_threads()

    # Ensure that the second node was deleted
    # (the hold cap of 1 was already reached, so no second hold).
    with pool.getDB().getSession() as session:
        node = session.getNode(2)
        self.assertEqual(node, None)
|
||||
|
||||
|
||||
class TestGearClient(tests.DBTestCase):
|
||||
def test_wait_for_completion(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user