From dd18c70543db6f42e531db24606ed8008015828f Mon Sep 17 00:00:00 2001 From: Ruby Loo Date: Thu, 16 Jan 2020 15:55:29 +0000 Subject: [PATCH] Nodes in maintenance didn't fail, when they should have In this code in base_manager.py, _fail_if_in_state() [1]: if the node is in maintenance, nothing is done. This means that when a node in maintenance is in mid deployment or cleaning and their conductor dies, it won't get put into a failed state [2]. This fixes it. [1] https://opendev.org/openstack/ironic/src/commit/8294afa6231629f9734f19ea5b3b0253ee9b8957/ironic/conductor/base_manager.py#L485 [2] https://opendev.org/openstack/ironic/src/commit/8294afa6231629f9734f19ea5b3b0253ee9b8957/ironic/conductor/base_manager.py#L235 Story #2007098 Task #38134 Change-Id: Ide70619271455685d09671ae16d744fc9ae58a02 --- ironic/conductor/base_manager.py | 12 ++++++---- ironic/conductor/manager.py | 1 + .../tests/unit/conductor/test_base_manager.py | 23 +++++++++++++++++++ ironic/tests/unit/conductor/test_manager.py | 1 + ...-in-maintenance-fail-afd0eace24fa28be.yaml | 8 +++++++ 5 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/node-in-maintenance-fail-afd0eace24fa28be.yaml diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index d610d2163b..8ef568d830 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -476,15 +476,19 @@ class BaseConductorManager(object): node_iter = self.iter_nodes(filters=filters, sort_key=sort_key, sort_dir='asc') - + desired_maintenance = filters.get('maintenance') workers_count = 0 for node_uuid, driver, conductor_group in node_iter: try: with task_manager.acquire(context, node_uuid, purpose='node state check') as task: - if (task.node.maintenance - or task.node.provision_state - not in provision_state): + # Check maintenance value since it could have changed + # after the filtering was done. + if (desired_maintenance is not None + and desired_maintenance != task.node.maintenance): + continue + + if task.node.provision_state not in provision_state: continue target_state = (None if not keep_target_state else diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 8cc5a1c0b3..05abed1e11 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -3188,6 +3188,7 @@ class ConductorManager(base_manager.BaseConductorManager): callback_timeout = CONF.conductor.inspect_wait_timeout filters = {'reserved': False, + 'maintenance': False, 'provision_state': states.INSPECTWAIT, 'inspection_started_before': callback_timeout} sort_key = 'inspection_started_at' diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py index 02bd171f92..f8ba77c3fa 100644 --- a/ironic/tests/unit/conductor/test_base_manager.py +++ b/ironic/tests/unit/conductor/test_base_manager.py @@ -521,3 +521,26 @@ class StartConsolesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertIsNone(test_node.last_error) self.assertTrue(log_mock.warning.called) self.assertFalse(mock_notify.called) + + +class MiscTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): + def setUp(self): + super(MiscTestCase, self).setUp() + self._start_service() + + def test__fail_transient_state(self): + node = obj_utils.create_test_node(self.context, + driver='fake-hardware', + provision_state=states.DEPLOYING) + self.service._fail_transient_state(states.DEPLOYING, 'unknown err') + node.refresh() + self.assertEqual(states.DEPLOYFAIL, node.provision_state) + + def test__fail_transient_state_maintenance(self): + node = obj_utils.create_test_node(self.context, + driver='fake-hardware', + maintenance=True, + provision_state=states.DEPLOYING) + self.service._fail_transient_state(states.DEPLOYING, 'unknown err') + node.refresh() + self.assertEqual(states.DEPLOYFAIL, node.provision_state) diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index f1ca254004..b1a27af1a2 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -8103,6 +8103,7 @@ class ManagerCheckInspectWaitTimeoutsTestCase(mgr_utils.CommonMixIn, self.task2 = self._create_task(node=self.node2) self.filters = {'reserved': False, + 'maintenance': False, 'inspection_started_before': 300, 'provision_state': states.INSPECTWAIT} self.columns = ['uuid', 'driver', 'conductor_group'] diff --git a/releasenotes/notes/node-in-maintenance-fail-afd0eace24fa28be.yaml b/releasenotes/notes/node-in-maintenance-fail-afd0eace24fa28be.yaml new file mode 100644 index 0000000000..a5c54669d8 --- /dev/null +++ b/releasenotes/notes/node-in-maintenance-fail-afd0eace24fa28be.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + If a node is in mid-deployment or cleaning and its conductor dies, ironic + will move that node into a failed state. However, this wasn't being done + if those nodes were also in maintenance. This has been fixed. See + `story 2007098 `_ for + more details.