Task Manager: Log if the lock takes too long

In order to troubleshoot some of the locking issues, we need to
understand if we're getting held up at the start of the lock, or
at the far end of the lock upon completing the actual needful
work.

This change adds additional logging on the front end of the lock,
so we can tell when acquisition itself is what is slow, and also
sets the rpc object to None explicitly before returning the
update_node result, since it appears a stream of such objects can
cause issues with sqlite's release of the database from one thread
to another.

Change-Id: I8ef68ff62e8f93f7bc5bd9ee7c6ac8d96328f929
Authored by Julia Kreger on 2023-08-22 15:37:30 -07:00; committed by Doug Goldstein
commit 757bf27cac (parent e0e43f6353)
2 changed files with 11 additions and 1 deletion

ironic/api/controllers/v1/node.py

@@ -3204,7 +3204,6 @@ class NodesController(rest.RestController):
        chassis_uuid = api_node.get('chassis_uuid')
        notify.emit_end_notification(context, new_node, 'update',
                                     chassis_uuid=chassis_uuid)
        return api_node
    @METRICS.timer('NodesController.delete')
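
The node.py hunk above is truncated, so the actual rpc-object change is not
visible here. As a reviewer's aid, below is a minimal, runnable sketch of the
pattern the commit message describes: dropping the local reference to the
RPC-layer object before returning the API-facing result. Every name in it is
a hypothetical placeholder, not Ironic code.

# Hypothetical illustration only: release the RPC object reference before
# returning, so nothing lingers while SQLite hands the database from one
# thread to another.
def update_and_convert(rpc_node, patch):
    rpc_node.update(patch)     # stand-in for the conductor update_node call
    api_node = dict(rpc_node)  # stand-in for the API conversion step
    rpc_node = None            # set the rpc object to None explicitly
    return api_node


print(update_and_convert({'uuid': 'abc123'}, {'name': 'node-1'}))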

ironic/conductor/task_manager.py

@@ -338,6 +338,10 @@ class TaskManager(object):
        else:
            stop_after = tenacity.stop_after_attempt(1)
        max_lock_time = \
            CONF.conductor.node_locked_retry_interval * \
            CONF.conductor.node_locked_retry_attempts
        # NodeLocked exceptions can be annoying. Let's try to alleviate
        # some of that pain by retrying our lock attempts.
        @tenacity.retry(
@@ -347,6 +351,13 @@ class TaskManager(object):
                CONF.conductor.node_locked_retry_interval),
            reraise=True)
        def reserve_node():
            if self._debug_timer.elapsed() > max_lock_time:
                LOG.warning('We have exceeded the normal maximum time window '
                            'to complete a node lock attempting to reserve '
                            'node %(node)s for purpose %(purpose)s. At '
                            '%(time).2f seconds.',
                            {'node': self.node_id, 'purpose': self._purpose,
                             'time': self._debug_timer.elapsed()})
            self.node = objects.Node.reserve(self.context, CONF.host,
                                             self.node_id)
            LOG.debug("Node %(node)s successfully reserved for %(purpose)s "