DB: Only re-query for a lock holder if we cannot lock

Dbapi method _reserve_node_place_lock is a bit of a special
method. It has both a decorator to retry sqlite "database is locked"
issues, and an outer synchronized process fair lock
(from oslo.concurrency.lockutils), which ensures only *one* thread
is working on locks at a time.

Thing is, we can build contention when a stack of heartbeats
come in, because they are forced to execute in serialized fashion.

And whil investigating some metal3 logs, we could see some lock
interactions are basically instant, and when things begin to
get backed up, we start seeing 10+ second gaps where we are
trying to get ahold of the database, and can't lock the node.

And looking at the code for the method, I realized we were *always*
re-querying the node, but never returning it after updating the node.
Apparently, so we can just log *if* there was an issue.

Instead, just consult the result set and then re-query if we must
to determine *who* holds the lock, we now only do so *if* we are
operating without SQLite, because if we are then we can safely
assume the lock came from another thread.

Change-Id: Ie606439670be21cf267eb541ce864711d2097207
This commit is contained in:
Julia Kreger 2023-09-01 06:49:29 -07:00
parent bcfddda517
commit bb02c49def
2 changed files with 41 additions and 3 deletions

View File

@ -714,14 +714,17 @@ class Connection(api.Connection):
values(reservation=tag).
execution_options(synchronize_session=False))
session.flush()
node = self._get_node_reservation(node.id)
# NOTE(TheJulia): In SQLAlchemy 2.0 style, we don't
# magically get a changed node as they moved from the
# many ways to do things to singular ways to do things.
if res.rowcount != 1:
# Nothing updated and node exists. Must already be
# locked.
raise exception.NodeLocked(node=node.uuid, host=node.reservation)
# locked. Identify who holds it and log.
if utils.is_ironic_using_sqlite():
lock_holder = CONF.hostname
else:
lock_holder = self._get_node_reservation(node.id).reservation
raise exception.NodeLocked(node=node.uuid, host=lock_holder)
@oslo_db_api.retry_on_deadlock
def reserve_node(self, tag, node_id):

View File

@ -15,6 +15,7 @@
"""Tests for manipulating Nodes via the DB API"""
import copy
import datetime
from unittest import mock
@ -29,6 +30,7 @@ from sqlalchemy.orm import exc as sa_orm_exc
from ironic.common import exception
from ironic.common import states
from ironic.common import utils as common_utils
from ironic.db.sqlalchemy import api as dbapi
from ironic.db.sqlalchemy.api import Connection as db_conn
from ironic.db.sqlalchemy.models import NodeInventory
@ -1037,6 +1039,39 @@ class DbNodeTestCase(base.DbTestCase):
res = self.dbapi.get_node_by_uuid(uuid)
self.assertEqual(r1, res.reservation)
def test_reserve_node_reads_reservation_once_sqlite(self):
node = utils.create_test_node()
uuid = node.uuid
r1 = 'fake-reservation'
with mock.patch.object(db_conn, '_get_node_reservation',
autospec=True) as mock_get_res:
mock_get_res.return_value = node
self.dbapi.reserve_node(r1, uuid)
mock_get_res.assert_called_once_with(mock.ANY, node.uuid)
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
def test_reserve_node_reads_reservation_twice(self, is_sqlite_mock):
# Ensure we re-query for who holds the reservation *when* lock fails
# to trigger.
node = utils.create_test_node()
uuid = node.uuid
is_sqlite_mock.return_value = False
r1 = 'fake-reservation'
self.dbapi.update_node(node.id, {'reservation': r1})
locked_node = copy.copy(node)
locked_node.reservation = r1
with mock.patch.object(db_conn, '_get_node_reservation',
autospec=True) as mock_get_res:
mock_get_res.side_effect = [node, locked_node]
self.assertRaisesRegex(exception.NodeLocked,
'locked by host fake-reservation',
self.dbapi.reserve_node, r1, uuid)
mock_get_res.assert_has_calls([
mock.call(mock.ANY, node.uuid),
mock.call(mock.ANY, node.id)])
def test_release_reservation(self):
node = utils.create_test_node()
uuid = node.uuid