Task Manager: Log if the lock takes too long

In order to troubleshoot some of the locking issues, we need to
understand if we're getting held up at the start of the lock, or
at the far end of the lock upon completing the actual needful
work.

This change adds additional logging on the front end of the lock,
so we can tell when acquisition itself is what is slow, and also
sets the rpc object to None explicitly before returning the
update_node result, since it appears a stream of such objects can
cause issues with sqlite's release of the database from one thread
to another.

Change-Id: I8ef68ff62e8f93f7bc5bd9ee7c6ac8d96328f929
Authored by Julia Kreger on 2023-08-22 15:37:30 -07:00; committed by Doug Goldstein
commit 757bf27cac (parent e0e43f6353)
2 changed files with 11 additions and 1 deletion

ironic/api/controllers/v1/node.py

@@ -3204,7 +3204,6 @@ class NodesController(rest.RestController):
        chassis_uuid = api_node.get('chassis_uuid')
        notify.emit_end_notification(context, new_node, 'update',
                                     chassis_uuid=chassis_uuid)
        return api_node
    @METRICS.timer('NodesController.delete')
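
The node.py hunk above is truncated, so the actual rpc-object change is not
visible here. As a reviewer's aid, below is a minimal, runnable sketch of the
pattern the commit message describes: dropping the local reference to the
RPC-layer object before returning the API-facing result. Every name in it is
a hypothetical placeholder, not Ironic code.

# Hypothetical illustration only: release the RPC object reference before
# returning, so nothing lingers while SQLite hands the database from one
# thread to another.
def update_and_convert(rpc_node, patch):
    rpc_node.update(patch)     # stand-in for the conductor update_node call
    api_node = dict(rpc_node)  # stand-in for the API conversion step
    rpc_node = None            # set the rpc object to None explicitly
    return api_node


print(update_and_convert({'uuid': 'abc123'}, {'name': 'node-1'}))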

ironic/conductor/task_manager.py

@@ -338,6 +338,10 @@ class TaskManager(object):
        else:
            stop_after = tenacity.stop_after_attempt(1)
        max_lock_time = \
            CONF.conductor.node_locked_retry_interval * \
            CONF.conductor.node_locked_retry_attempts
        # NodeLocked exceptions can be annoying. Let's try to alleviate
        # some of that pain by retrying our lock attempts.
        @tenacity.retry(
@@ -347,6 +351,13 @@ class TaskManager(object):
                CONF.conductor.node_locked_retry_interval),
            reraise=True)
        def reserve_node():
            if self._debug_timer.elapsed() > max_lock_time:
                LOG.warning('We have exceeded the normal maximum time window '
                            'to complete a node lock attempting to reserve '
                            'node %(node)s for purpose %(purpose)s. At '
                            '%(time).2f seconds.',
                            {'node': self.node_id, 'purpose': self._purpose,
                             'time': self._debug_timer.elapsed()})
            self.node = objects.Node.reserve(self.context, CONF.host,
                                             self.node_id)
            LOG.debug("Node %(node)s successfully reserved for %(purpose)s "