From 3f9151163e6bdf27da80b01c507deef6f4479655 Mon Sep 17 00:00:00 2001 From: Steve Baker Date: Thu, 5 Oct 2023 14:59:45 +1300 Subject: [PATCH] Introduce conductor touch while offline This adds an `online` argument to the conductor touch methods so that touch can be called with `online=False`. When called periodically this allows the conductor `updated_at` to be within the threshold to avoid locked nodes being failed as orphans by another conductor. This will be used by drain shutdown (and graceful shutdown) so that tasks can complete on existing locked nodes within the shutdown timeout, while the conductor is also removed from the hash ring so new tasks are not started on that conductor. This change introduces the api but the existing behaviour won't change until BaseConductorManager.del_host() no longer calls keepalive_halt(). Change-Id: Iedd62193fac1009137b9ee47a6ef5a9a8576f261 --- ironic/conductor/base_manager.py | 7 +++++-- ironic/db/api.py | 8 +++++++- ironic/db/sqlalchemy/api.py | 4 ++-- ironic/objects/conductor.py | 4 ++-- ironic/tests/unit/conductor/test_base_manager.py | 2 +- ironic/tests/unit/db/test_conductor.py | 15 +++++++++++++++ ironic/tests/unit/objects/test_conductor.py | 2 +- 7 files changed, 33 insertions(+), 9 deletions(-) diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index 7287ea4ebf..ef97fd1135 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -321,13 +321,16 @@ class BaseConductorManager(object): # This is only used in tests currently. Delete it? self._periodic_task_callables = periodic_task_callables + def keepalive_halt(self): + self._keepalive_evt.set() + def del_host(self, deregister=True, clear_node_reservations=True): # Conductor deregistration fails if called on non-initialized # conductor (e.g. when rpc server is unreachable). if not hasattr(self, 'conductor'): return self._shutdown = True - self._keepalive_evt.set() + self.keepalive_halt() if clear_node_reservations: # clear all locks held by this conductor before deregistering @@ -469,7 +472,7 @@ class BaseConductorManager(object): return while not self._keepalive_evt.is_set(): try: - self.conductor.touch() + self.conductor.touch(online=not self._shutdown) except db_exception.DBConnectionError: LOG.warning('Conductor could not connect to database ' 'while heartbeating.') diff --git a/ironic/db/api.py b/ironic/db/api.py index 8aeda4b293..f5a097a887 100644 --- a/ironic/db/api.py +++ b/ironic/db/api.py @@ -585,10 +585,16 @@ class Connection(object, metaclass=abc.ABCMeta): """ @abc.abstractmethod - def touch_conductor(self, hostname): + def touch_conductor(self, hostname, online=True): """Mark a conductor as active by updating its 'updated_at' property. + Calling periodically with ``online=False`` will result in the conductor + appearing unregistered, but recently enough to prevent other conductors + failing orphan nodes. This improves the behaviour of graceful and drain + shutdown. + :param hostname: The hostname of this conductor service. + :param online: Whether the conductor is online. :raises: ConductorNotFound """ diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index a6ab523e01..b3bda9802c 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -1392,13 +1392,13 @@ class Connection(api.Connection): raise exception.ConductorNotFound(conductor=hostname) @oslo_db_api.retry_on_deadlock - def touch_conductor(self, hostname): + def touch_conductor(self, hostname, online=True): with _session_for_write() as session: query = sa.update(models.Conductor).where( models.Conductor.hostname == hostname ).values({ 'updated_at': timeutils.utcnow(), - 'online': True} + 'online': online} ).execution_options(synchronize_session=False) res = session.execute(query) count = res.rowcount diff --git a/ironic/objects/conductor.py b/ironic/objects/conductor.py index 307e218c5f..6c35803173 100644 --- a/ironic/objects/conductor.py +++ b/ironic/objects/conductor.py @@ -111,9 +111,9 @@ class Conductor(base.IronicObject, object_base.VersionedObjectDictCompat): # methods can be used in the future to replace current explicit RPC calls. # Implications of calling new remote procedures should be thought through. # @object_base.remotable - def touch(self, context=None): + def touch(self, context=None, online=True): """Touch this conductor's DB record, marking it as up-to-date.""" - self.dbapi.touch_conductor(self.hostname) + self.dbapi.touch_conductor(self.hostname, online=online) # NOTE(xek): We don't want to enable RPC on this call just yet. Remotable # methods can be used in the future to replace current explicit RPC calls. diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py index 8838cff076..b8b60b01ba 100644 --- a/ironic/tests/unit/conductor/test_base_manager.py +++ b/ironic/tests/unit/conductor/test_base_manager.py @@ -359,7 +359,7 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_is_sqlite.return_value = False self.service._conductor_service_record_keepalive() self.assertEqual(1, mock_is_sqlite.call_count) - mock_touch.assert_called_once_with(self.hostname) + mock_touch.assert_called_once_with(self.hostname, online=True) @mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True) def test__conductor_service_record_keepalive_failed_db_conn( diff --git a/ironic/tests/unit/db/test_conductor.py b/ironic/tests/unit/db/test_conductor.py index a16ef08b5c..59c280076f 100644 --- a/ironic/tests/unit/db/test_conductor.py +++ b/ironic/tests/unit/db/test_conductor.py @@ -156,6 +156,21 @@ class DbConductorTestCase(base.DbTestCase): c = self.dbapi.get_conductor(c.hostname) self.assertEqual(test_time, timeutils.normalize_time(c.updated_at)) + @mock.patch.object(timeutils, 'utcnow', autospec=True) + def test_touch_conductor_offline(self, mock_utcnow): + test_time = datetime.datetime(2000, 1, 1, 0, 0) + mock_utcnow.return_value = test_time + c = self._create_test_cdr() + self.assertEqual(test_time, timeutils.normalize_time(c.updated_at)) + + test_time = datetime.datetime(2000, 1, 1, 0, 1) + mock_utcnow.return_value = test_time + self.dbapi.touch_conductor(c.hostname, online=False) + self.assertRaises( + exception.ConductorNotFound, + self.dbapi.get_conductor, + c.hostname) + def test_touch_conductor_not_found(self): # A conductor's heartbeat will not create a new record, # it will only update existing ones diff --git a/ironic/tests/unit/objects/test_conductor.py b/ironic/tests/unit/objects/test_conductor.py index 4b3c954904..f9c7c37c59 100644 --- a/ironic/tests/unit/objects/test_conductor.py +++ b/ironic/tests/unit/objects/test_conductor.py @@ -77,7 +77,7 @@ class TestConductorObject(db_base.DbTestCase): c = objects.Conductor.get_by_hostname(self.context, host) c.touch(self.context) mock_get_cdr.assert_called_once_with(host, online=True) - mock_touch_cdr.assert_called_once_with(host) + mock_touch_cdr.assert_called_once_with(host, online=True) def test_refresh(self): host = self.fake_conductor['hostname']