From 2fd3d8f01ef471b7f623291406528818223251b8 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Tue, 22 Aug 2023 16:02:11 -0700 Subject: [PATCH] Fail on node lookup if it is locked In the agent token mechanism, restrictions exist when a an agent token can be generated, and unfortunately this has to be done on the conductor side involving a lock and a task because we need to save the state of the node. As such, we were in a situation where we were waiting on DB node locking, which would prevent the agent from getting a node, and potentially causing the lookup operation to fail, eventually. We now quickly return NodeLocked which shouldn't cause the agent any issues, although we need to improve error handling there as well. Change-Id: Ice335eed82b936753be99eedb16ceccf8a9a86a8 --- ironic/conductor/manager.py | 5 ++++- ironic/tests/unit/conductor/test_manager.py | 10 ++++++++++ .../fail-fast-on-lookup-lock-a408feac87890050.yaml | 7 +++++++ 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/fail-fast-on-lookup-lock-a408feac87890050.yaml diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index fc53595908..47318d47a5 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -3585,7 +3585,10 @@ class ConductorManager(base_manager.BaseConductorManager): # pre-generation of tokens with virtual media usage. node.set_driver_internal_info('agent_secret_token', "******") return node - task.upgrade_lock() + # Do not retry the lock, fail immediately otherwise + # we can cause these requests to stack up on the API, + # all thinking they can process the node. + task.upgrade_lock(retry=False) LOG.debug('Generating agent token for node %(node)s', {'node': task.node.uuid}) utils.add_secret_token(task.node) diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 582c87d80c..fd2e43a50e 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -3717,6 +3717,16 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, mgr_utils.CommonMixIn, self.assertEqual('******', res.driver_internal_info['agent_secret_token']) + def test_node_with_token_already_locked(self): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + network_interface='noop', + reservation='meow') + exc = self.assertRaises(messaging.rpc.ExpectedException, + self.service.get_node_with_token, + self.context, node.id) + self.assertEqual(exception.NodeLocked, exc.exc_info[0]) + @mgr_utils.mock_record_keepalive class ConsoleTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): diff --git a/releasenotes/notes/fail-fast-on-lookup-lock-a408feac87890050.yaml b/releasenotes/notes/fail-fast-on-lookup-lock-a408feac87890050.yaml new file mode 100644 index 0000000000..1f905bbcaa --- /dev/null +++ b/releasenotes/notes/fail-fast-on-lookup-lock-a408feac87890050.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes an issue where lookups to generate an agent token would stack up as + the internal lock upgrade logic silently holds on to the request while trying + to obtain a lock. The task creation will now immediately fail with a + ``NodeLocked`` exception, which the agent will retry.