From bfc2ad56d558fbb73497dcf641b61a5ab0c23b5f Mon Sep 17 00:00:00 2001
From: Jason Anderson
Date: Thu, 5 Nov 2020 12:14:33 -0600
Subject: [PATCH] Always retry locking when performing task handoff

There are some Ironic execution workflows where there is not an easy
way to retry, such as when attempting to hand off the processing of an
async task to a conductor. Task handoff can require releasing a lock on
the node so that the next entity processing the task can acquire the
lock itself. However, this is vulnerable to race conditions, as there
is no uniform retry mechanism built into such handoffs.

Consider the continue_node_deploy/clean logic, which does this:

    method = 'continue_node_%s' % operation
    # Need to release the lock to let the conductor take it
    task.release_resources()
    getattr(rpc, method)(task.context, uuid, topic=topic)

If another process obtains a lock between the releasing of resources
and the acquiring of the lock during the continue_node_* operation, and
holds the lock longer than the max attempts * interval window (which
defaults to 3 seconds), then the handoff will never complete.

Beyond that, because there is no proper queue of processes waiting on
the lock, there is no fairness, so it is also possible that, instead of
one long-held lock, the lock is obtained and held for short windows
several times by other competing processes.

This manifests as nodes occasionally getting stuck in the "DEPLOYING"
state during a deploy. For example, a user may attempt to open or
access the serial console before the deploy is complete; the serial
console process obtains the lock and starves the conductor of it, so
the conductor cannot finish the deploy. A long heartbeat or a
badly-timed sequence of heartbeats could do the same.

To fix this, this commit introduces the concept of a "patient" lock,
which retries indefinitely until it no longer encounters the NodeLocked
exception, i.e. until the lock is acquired. This overrides the normal
bounded retry behavior.

.. note:: There may be other cases where such a lock is desired.
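
As an illustration only (not the actual Ironic implementation), the
difference between the default bounded retry and a "patient" lock can
be sketched as follows; acquire_lock, reserve_node, NodeLocked and the
interval value here are hypothetical stand-ins for the real reservation
machinery:

    import itertools
    import time

    class NodeLocked(Exception):
        """Stand-in for the error raised while another process holds
        the node reservation."""

    def acquire_lock(reserve_node, node_id, patient=False,
                     max_attempts=3, interval=1.0):
        # patient=True: keep retrying until the reservation succeeds.
        # patient=False: give up after max_attempts tries.
        attempts = (itertools.count(1) if patient
                    else range(1, max_attempts + 1))
        for attempt in attempts:
            try:
                return reserve_node(node_id)
            except NodeLocked:
                if not patient and attempt == max_attempts:
                    raise
                time.sleep(interval)
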
Story: #2008323
Change-Id: I9937fab18a50111ec56a3fd023cdb9d510a1e990
---
 ironic/conductor/manager.py                   |  4 +-
 ironic/conductor/task_manager.py              | 13 +++++-
 ironic/tests/unit/conductor/test_manager.py   | 43 +++++++++++++++++++
 .../tests/unit/conductor/test_task_manager.py | 19 ++++++++
 ...tuck-deploying-state-43d51149a02c08b8.yaml |  7 +++
 5 files changed, 82 insertions(+), 4 deletions(-)
 create mode 100644 releasenotes/notes/story-2008323-fix-stuck-deploying-state-43d51149a02c08b8.yaml

diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 7fcccec041..66013b99d6 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -861,7 +861,7 @@ class ConductorManager(base_manager.BaseConductorManager):
         """
         LOG.debug("RPC continue_node_deploy called for node %s.", node_id)
 
-        with task_manager.acquire(context, node_id, shared=False,
+        with task_manager.acquire(context, node_id, shared=False, patient=True,
                                   purpose='continue node deploying') as task:
             node = task.node
 
@@ -1118,7 +1118,7 @@ class ConductorManager(base_manager.BaseConductorManager):
         """
         LOG.debug("RPC continue_node_clean called for node %s.", node_id)
 
-        with task_manager.acquire(context, node_id, shared=False,
+        with task_manager.acquire(context, node_id, shared=False, patient=True,
                                   purpose='continue node cleaning') as task:
             node = task.node
             if node.target_provision_state == states.MANAGEABLE:
diff --git a/ironic/conductor/task_manager.py b/ironic/conductor/task_manager.py
index 7ed3eab3c4..8a337ee332 100644
--- a/ironic/conductor/task_manager.py
+++ b/ironic/conductor/task_manager.py
@@ -170,7 +170,7 @@ class TaskManager(object):
     """
 
     def __init__(self, context, node_id, shared=False,
-                 purpose='unspecified action', retry=True,
+                 purpose='unspecified action', retry=True, patient=False,
                  load_driver=True):
         """Create a new TaskManager.
 
@@ -185,6 +185,12 @@ class TaskManager(object):
             lock. Default: False.
         :param purpose: human-readable purpose to put to debug logs.
         :param retry: whether to retry locking if it fails. Default: True.
+        :param patient: whether to indefinitely retry locking if it fails.
+            Set this to True if there is an operation that does not
+            have any fallback or built-in retry mechanism, such as
+            finalizing a state transition during deploy/clean.
+            The default retry behavior is to retry a configured
+            number of times and then give up. Default: False.
         :param load_driver: whether to load the ``driver`` object. Set this
             to False if loading the driver is undesired or impossible.
@@ -203,6 +209,7 @@ class TaskManager(object):
         self.node_id = node_id
         self.shared = shared
         self._retry = retry
+        self._patient = patient
         self.fsm = states.machine.copy()
         self._purpose = purpose
 
@@ -260,7 +267,9 @@ class TaskManager(object):
     def _lock(self):
         self._debug_timer.restart()
 
-        if self._retry:
+        if self._patient:
+            attempts = None
+        elif self._retry:
             attempts = CONF.conductor.node_locked_retry_attempts
         else:
             attempts = 1
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 648e86c981..93023484f7 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -1784,6 +1784,27 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
                           deployments.continue_node_deploy, mock.ANY)
 
+    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
+                autospec=True)
+    def test_continue_node_deploy_locked(self, mock_spawn):
+        """Test that continuing a deploy via RPC cannot fail due to locks."""
+        max_attempts = 3
+        self.config(node_locked_retry_attempts=max_attempts, group='conductor')
+        prv_state = states.DEPLOYWAIT
+        tgt_prv_state = states.ACTIVE
+        node = obj_utils.create_test_node(self.context, driver='fake-hardware',
+                                          provision_state=prv_state,
+                                          target_provision_state=tgt_prv_state,
+                                          last_error=None,
+                                          deploy_step=self.deploy_steps[0])
+        self._start_service()
+        with mock.patch.object(objects.Node, 'reserve', autospec=True) as mck:
+            mck.side_effect = (
+                ([exception.NodeLocked(node='foo', host='foo')] * max_attempts)
+                + [node])
+            self.service.continue_node_deploy(self.context, node.uuid)
+        self._stop_service()
+
     @mock.patch.object(task_manager.TaskManager, 'process_event',
                        autospec=True)
     @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
                 autospec=True)
@@ -2582,6 +2603,28 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
     def test_continue_node_clean_manual_abort_last_clean_step(self):
         self._continue_node_clean_abort_last_clean_step(manual=True)
 
+    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
+                autospec=True)
+    def test_continue_node_clean_locked(self, mock_spawn):
+        """Test that continuing a clean via RPC cannot fail due to locks."""
+        max_attempts = 3
+        self.config(node_locked_retry_attempts=max_attempts, group='conductor')
+        driver_info = {'clean_steps': [self.clean_steps[0]],
+                       'clean_step_index': 0}
+        tgt_prov_state = states.AVAILABLE
+        node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            provision_state=states.CLEANWAIT,
+            target_provision_state=tgt_prov_state, last_error=None,
+            driver_internal_info=driver_info, clean_step=self.clean_steps[0])
+        self._start_service()
+        with mock.patch.object(objects.Node, 'reserve', autospec=True) as mck:
+            mck.side_effect = (
+                ([exception.NodeLocked(node='foo', host='foo')] * max_attempts)
+                + [node])
+            self.service.continue_node_clean(self.context, node.uuid)
+        self._stop_service()
+
 
 class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
                            db_base.DbTestCase):
diff --git a/ironic/tests/unit/conductor/test_task_manager.py b/ironic/tests/unit/conductor/test_task_manager.py
index e49982d593..c20d053e3e 100644
--- a/ironic/tests/unit/conductor/test_task_manager.py
+++ b/ironic/tests/unit/conductor/test_task_manager.py
@@ -222,6 +222,25 @@ class TaskManagerTestCase(db_base.DbTestCase):
         reserve_mock.assert_called_once_with(self.context, self.host,
                                              'fake-node-id')
 
+    def test_excl_lock_exception_patient(
+            self, get_voltgt_mock, get_volconn_mock, get_portgroups_mock,
+            get_ports_mock, build_driver_mock,
+            reserve_mock, release_mock, node_get_mock):
+        retry_attempts = 3
+        self.config(node_locked_retry_attempts=retry_attempts,
+                    group='conductor')
+
+        # Fail on the first 3 attempts, succeed on the fourth.
+        reserve_mock.side_effect = (
+            ([exception.NodeLocked(node='foo', host='foo')] * 3) + [self.node])
+
+        task_manager.TaskManager(self.context, 'fake-node-id', patient=True)
+
+        expected_calls = [mock.call(self.context, self.host,
+                                    'fake-node-id')] * 4
+        reserve_mock.assert_has_calls(expected_calls)
+        self.assertEqual(4, reserve_mock.call_count)
+
     def test_excl_lock_reserve_exception(
             self, get_voltgt_mock, get_volconn_mock, get_portgroups_mock,
             get_ports_mock, build_driver_mock,
diff --git a/releasenotes/notes/story-2008323-fix-stuck-deploying-state-43d51149a02c08b8.yaml b/releasenotes/notes/story-2008323-fix-stuck-deploying-state-43d51149a02c08b8.yaml
new file mode 100644
index 0000000000..a8f1abd5e4
--- /dev/null
+++ b/releasenotes/notes/story-2008323-fix-stuck-deploying-state-43d51149a02c08b8.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes a bug where a conductor could fail to complete a deployment if there
+    was contention on a shared lock. This would manifest as an instance being
+    stuck in the "deploying" state, though the node had in fact started or even
+    completed its final boot.
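
For reference, a caller that needs this behavior opts in the same way the
continue_node_deploy/clean handlers above do. A minimal, hypothetical sketch
(the finish_handoff helper and its purpose string are illustrative only and
are not part of this patch):

    from ironic.conductor import task_manager

    def finish_handoff(context, node_id):
        # patient=True keeps retrying on NodeLocked instead of giving up
        # after CONF.conductor.node_locked_retry_attempts attempts.
        with task_manager.acquire(context, node_id, shared=False,
                                  patient=True,
                                  purpose='finish task handoff') as task:
            # Placeholder for the real handed-off work done under the lock.
            node = task.node
            return node.uuid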