Add hold command to disable nodes

This allows nodes to be set in an idle state
so that they will not have jobs scheduled
while e.g. maintenance tasks are performed.
This is probably most useful for static nodes.

Change-Id: Iebc6b909f370fca11fab2be0b8805d4daef33afe
This commit is contained in:
mbecker 2022-08-22 16:30:36 +02:00
parent 95b3d4c302
commit 1658aa9851
4 changed files with 188 additions and 3 deletions

View File

@ -186,6 +186,11 @@ delete
.. program-output:: nodepool delete --help .. program-output:: nodepool delete --help
:nostderr: :nostderr:
hold
^^^^
.. program-output:: nodepool hold --help
:nostderr:
The following subcommands deal with ZooKeeper data management: The following subcommands deal with ZooKeeper data management:
info info

View File

@ -88,6 +88,13 @@ class NodePoolCmd(NodepoolApp):
action='store_true', action='store_true',
help='delete the node in the foreground') help='delete the node in the foreground')
cmd_hold = subparsers.add_parser(
'hold',
help='place a node in the HOLD state '
'e.g. for running maintenance tasks')
cmd_hold.set_defaults(func=self.hold)
cmd_hold.add_argument('id', help='node id')
cmd_image_delete = subparsers.add_parser( cmd_image_delete = subparsers.add_parser(
'image-delete', 'image-delete',
help='delete an image') help='delete an image')
@ -303,6 +310,23 @@ class NodePoolCmd(NodepoolApp):
self.list(node_id=node.id) self.list(node_id=node.id)
def _change_node_state(self, new_state):
node = self.zk.getNode(self.args.id)
if not node:
print("Node id %s not found" % self.args.id)
return
self.zk.lockNode(node, blocking=True, timeout=5)
node.state = new_state
self.zk.storeNode(node)
self.zk.unlockNode(node)
self.list(node_id=node.id)
def hold(self):
self._change_node_state(zk.HOLD)
def dib_image_delete(self): def dib_image_delete(self):
(image, build_num) = self.args.id.rsplit('-', 1) (image, build_num) = self.args.id.rsplit('-', 1)
build = self.zk.getBuild(image, build_num) build = self.zk.getBuild(image, build_num)
@ -434,7 +458,7 @@ class NodePoolCmd(NodepoolApp):
'image-status', 'image-status',
'image-list', 'dib-image-delete', 'image-list', 'dib-image-delete',
'image-delete', 'alien-image-list', 'image-delete', 'alien-image-list',
'list', 'delete', 'list', 'delete', 'hold',
'request-list', 'info', 'erase', 'request-list', 'info', 'erase',
'image-pause', 'image-unpause', 'image-pause', 'image-unpause',
'export-image-data', 'import-image-data'): 'export-image-data', 'import-image-data'):

View File

@ -618,13 +618,13 @@ class DBTestCase(BaseTestCase):
if node.state == state: if node.state == state:
return node return node
def waitForNodeRequest(self, req, states=None): def waitForNodeRequest(self, req, states=None, max_time=ONE_MINUTE):
''' '''
Wait for a node request to transition to a final state. Wait for a node request to transition to a final state.
''' '''
if states is None: if states is None:
states = (zk.FULFILLED, zk.FAILED) states = (zk.FULFILLED, zk.FAILED)
for _ in iterate_timeout(ONE_MINUTE, Exception, for _ in iterate_timeout(max_time, Exception,
"Node request state transition", "Node request state transition",
interval=1): interval=1):
req = self.zk.getNodeRequest(req.id) req = self.zk.getNodeRequest(req.id)

View File

@ -22,6 +22,7 @@ import fixtures
import mock import mock
import testtools import testtools
from nodepool import exceptions as npe
from nodepool.cmd import nodepoolcmd from nodepool.cmd import nodepoolcmd
from nodepool import tests from nodepool import tests
from nodepool.zk import zookeeper as zk from nodepool.zk import zookeeper as zk
@ -350,6 +351,161 @@ class TestNodepoolCMD(tests.DBTestCase):
# Assert the node is gone # Assert the node is gone
self.assert_listed(configfile, ['list'], 0, nodes[0].id, 0) self.assert_listed(configfile, ['list'], 0, nodes[0].id, 0)
def test_hold(self):
configfile = self.setup_config('node.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
self.useBuilder(configfile)
pool.start()
self.waitForImage('fake-provider', 'fake-image')
nodes = self.waitForNodes('fake-label')
self.assertEqual(len(nodes), 1)
# Assert one node exists and it is nodes[0].id in a ready state.
self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1)
self.assert_nodes_listed(configfile, 1, zk.READY)
# Hold node
self.patch_argv('-c', configfile, 'hold', nodes[0].id)
nodepoolcmd.main()
# Assert the node is on hold
self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1)
self.assert_nodes_listed(configfile, 1, zk.HOLD)
# Re-enable node by deleting
old_node_id = nodes[0].id
self.patch_argv('-c', configfile, 'delete', nodes[0].id)
nodepoolcmd.main()
# Assert that the node is ready
self.waitForNodeDeletion(nodes[0])
new_nodes = self.waitForNodes('fake-label')
self.assertEqual(len(new_nodes), 1)
self.assert_listed(configfile, ['list'], 0, new_nodes[0].id, 1)
self.assert_nodes_listed(configfile, 1, zk.READY)
self.assertNotEqual(old_node_id, new_nodes[0].id)
# Request a node
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('fake-label')
self.assertEqual(len(req.nodes), 0)
self.zk.storeNodeRequest(req)
self.log.debug("Waiting for request %s", req.id)
req = self.waitForNodeRequest(req, (zk.FULFILLED,))
self.assertEqual(len(req.nodes), 1)
def test_attempt_hold_busy_node(self):
configfile = self.setup_config('node.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
self.useBuilder(configfile)
pool.start()
self.waitForImage('fake-provider', 'fake-image')
nodes = self.waitForNodes('fake-label')
self.assertEqual(len(nodes), 1)
# Assert one node exists and it is nodes[0].id in a ready state.
self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1)
self.assert_nodes_listed(configfile, 1, zk.READY)
# Request a node
req1 = zk.NodeRequest()
req1.state = zk.REQUESTED
req1.node_types.append('fake-label')
self.zk.storeNodeRequest(req1)
# Wait for node request
self.log.debug("Waiting for 1st request %s", req1.id)
req1 = self.waitForNodeRequest(req1, (zk.FULFILLED,))
self.assertEqual(len(req1.nodes), 1)
# Lock node and set it as in-use
node = self.zk.getNode(req1.nodes[0])
self.zk.lockNode(node, blocking=False)
node.state = zk.IN_USE
self.zk.storeNode(node)
self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1)
self.assert_nodes_listed(configfile, 1, zk.IN_USE)
# Attempt to hold the node, this should fail
# since another process holds the lock
with testtools.ExpectedException(npe.TimeoutException):
self.patch_argv('-c', configfile, 'hold', nodes[0].id)
nodepoolcmd.main()
def test_attempt_request_held_static_node(self):
configfile = self.setup_config('static-basic.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
nodes = self.waitForNodes('fake-label')
self.assertEqual(len(nodes), 1)
# Assert one node exists and it is nodes[0].id in a ready state.
self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1)
self.assert_nodes_listed(configfile, 1, zk.READY)
# Hold node
self.patch_argv('-c', configfile, 'hold', nodes[0].id)
nodepoolcmd.main()
# Assert the node is on HOLD
self.assertEqual(len(nodes), 1)
self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1)
self.assert_nodes_listed(configfile, 1, zk.HOLD)
# Prepare node request
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('fake-label')
self.zk.storeNodeRequest(req)
# Make a node request
# Expect to timeout since the node is not ready
self.log.debug("Waiting for request %s", req.id)
req = self.zk.getNodeRequest(req.id)
with testtools.ExpectedException(Exception):
req = self.waitForNodeRequest(req, (zk.FULFILLED,), max_time=30)
self.assertEqual(len(req.nodes), 0)
def test_attempt_request_held_node(self):
configfile = self.setup_config('node.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
self.useBuilder(configfile)
pool.start()
self.waitForImage('fake-provider', 'fake-image')
nodes = self.waitForNodes('fake-label')
self.assertEqual(len(nodes), 1)
# Assert one node exists and it is nodes[0].id in a ready state.
self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1)
self.assert_nodes_listed(configfile, 1, zk.READY)
# Hold node
self.patch_argv('-c', configfile, 'hold', nodes[0].id)
nodepoolcmd.main()
# Assert the node is on HOLD
self.assertEqual(len(nodes), 1)
self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1)
self.assert_nodes_listed(configfile, 1, zk.HOLD)
# Prepare node request
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('fake-label')
self.zk.storeNodeRequest(req)
# Make a node request
self.log.debug("Waiting for request %s", req.id)
req = self.waitForNodeRequest(req, (zk.FULFILLED,))
# Make sure we did not assign the held node
# but another node as long as the quota is not reached
self.assertNotEqual(nodes[0].id, req.nodes[0])
def test_image_build(self): def test_image_build(self):
configfile = self.setup_config('node.yaml') configfile = self.setup_config('node.yaml')
self.useBuilder(configfile) self.useBuilder(configfile)