From bb02c49def65cc086565b159763bdee34538c8d9 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Fri, 1 Sep 2023 06:49:29 -0700 Subject: [PATCH] DB: Only re-query for a lock holder if we cannot lock Dbapi method _reserve_node_place_lock is a bit of a special method. It has both a decorator to retry sqlite "database is locked" issues, and an outer synchronized process fair lock (from oslo.concurrency.lockutils), which ensures only *one* thread is working on locks at a time. Thing is, we can build contention when a stack of heartbeats come in, because they are forced to execute in serialized fashion. And whil investigating some metal3 logs, we could see some lock interactions are basically instant, and when things begin to get backed up, we start seeing 10+ second gaps where we are trying to get ahold of the database, and can't lock the node. And looking at the code for the method, I realized we were *always* re-querying the node, but never returning it after updating the node. Apparently, so we can just log *if* there was an issue. Instead, just consult the result set and then re-query if we must to determine *who* holds the lock, we now only do so *if* we are operating without SQLite, because if we are then we can safely assume the lock came from another thread. Change-Id: Ie606439670be21cf267eb541ce864711d2097207 --- ironic/db/sqlalchemy/api.py | 9 +++++--- ironic/tests/unit/db/test_nodes.py | 35 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index bf591ca3de..f8b3adddd5 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -714,14 +714,17 @@ class Connection(api.Connection): values(reservation=tag). execution_options(synchronize_session=False)) session.flush() - node = self._get_node_reservation(node.id) # NOTE(TheJulia): In SQLAlchemy 2.0 style, we don't # magically get a changed node as they moved from the # many ways to do things to singular ways to do things. if res.rowcount != 1: # Nothing updated and node exists. Must already be - # locked. - raise exception.NodeLocked(node=node.uuid, host=node.reservation) + # locked. Identify who holds it and log. + if utils.is_ironic_using_sqlite(): + lock_holder = CONF.hostname + else: + lock_holder = self._get_node_reservation(node.id).reservation + raise exception.NodeLocked(node=node.uuid, host=lock_holder) @oslo_db_api.retry_on_deadlock def reserve_node(self, tag, node_id): diff --git a/ironic/tests/unit/db/test_nodes.py b/ironic/tests/unit/db/test_nodes.py index 3dee718eb1..b10d367514 100644 --- a/ironic/tests/unit/db/test_nodes.py +++ b/ironic/tests/unit/db/test_nodes.py @@ -15,6 +15,7 @@ """Tests for manipulating Nodes via the DB API""" +import copy import datetime from unittest import mock @@ -29,6 +30,7 @@ from sqlalchemy.orm import exc as sa_orm_exc from ironic.common import exception from ironic.common import states +from ironic.common import utils as common_utils from ironic.db.sqlalchemy import api as dbapi from ironic.db.sqlalchemy.api import Connection as db_conn from ironic.db.sqlalchemy.models import NodeInventory @@ -1037,6 +1039,39 @@ class DbNodeTestCase(base.DbTestCase): res = self.dbapi.get_node_by_uuid(uuid) self.assertEqual(r1, res.reservation) + def test_reserve_node_reads_reservation_once_sqlite(self): + node = utils.create_test_node() + uuid = node.uuid + + r1 = 'fake-reservation' + + with mock.patch.object(db_conn, '_get_node_reservation', + autospec=True) as mock_get_res: + mock_get_res.return_value = node + self.dbapi.reserve_node(r1, uuid) + mock_get_res.assert_called_once_with(mock.ANY, node.uuid) + + @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) + def test_reserve_node_reads_reservation_twice(self, is_sqlite_mock): + # Ensure we re-query for who holds the reservation *when* lock fails + # to trigger. + node = utils.create_test_node() + uuid = node.uuid + is_sqlite_mock.return_value = False + r1 = 'fake-reservation' + self.dbapi.update_node(node.id, {'reservation': r1}) + locked_node = copy.copy(node) + locked_node.reservation = r1 + with mock.patch.object(db_conn, '_get_node_reservation', + autospec=True) as mock_get_res: + mock_get_res.side_effect = [node, locked_node] + self.assertRaisesRegex(exception.NodeLocked, + 'locked by host fake-reservation', + self.dbapi.reserve_node, r1, uuid) + mock_get_res.assert_has_calls([ + mock.call(mock.ANY, node.uuid), + mock.call(mock.ANY, node.id)]) + def test_release_reservation(self): node = utils.create_test_node() uuid = node.uuid