From 42e2685395d393c0d4cd840f7c23900df7300caa Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Wed, 8 Jun 2016 18:39:09 -0400 Subject: [PATCH] Add cleanwait timeout cleanup process Previously, if a node in a cleaning state timed out, the timeout process would not purge certain items from the node's configuration which resulted in a short circuiting of the logic cleaning being retried. This was a result of the node clean_step configuration not being purged upon a timeout occuring. This change adds a wrapper method around the cleaning failure error handler to allow the _fail_if_in_state method to call the error handler, since error handler syntax is not uniform and the _fail_if_in_state cannot pass arguments. It also changes the cleaning error handler to permit the error handler to delete the node clean_step, and cleaning related driver_internal_info configuration from a node in the event the node in in CLEANFAIL state. Change-Id: I9ee5c0b385648c9b7e1d330d5d1af9b2c486a436 Closes-Bug: #1590146 --- ironic/conductor/manager.py | 7 ++--- ironic/conductor/utils.py | 18 ++++++++++- ironic/tests/unit/conductor/test_manager.py | 13 +++++++- ironic/tests/unit/conductor/test_utils.py | 31 +++++++++++++++++++ .../cleaning-retry-fix-89a5d0e65920a064.yaml | 7 +++++ 5 files changed, 69 insertions(+), 7 deletions(-) create mode 100644 releasenotes/notes/cleaning-retry-fix-89a5d0e65920a064.yaml diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 8ab608c2f1..524810e3c6 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -1356,13 +1356,10 @@ class ConductorManager(base_manager.BaseConductorManager): 'provision_state': states.CLEANWAIT, 'maintenance': False, 'provisioned_before': callback_timeout} - last_error = _("Timeout reached while cleaning the node. Please " - "check if the ramdisk responsible for the cleaning is " - "running on the node.") self._fail_if_in_state(context, filters, states.CLEANWAIT, 'provision_updated_at', - last_error=last_error, - keep_target_state=True) + keep_target_state=True, + callback_method=utils.cleanup_cleanwait_timeout) @periodics.periodic(spacing=CONF.conductor.sync_local_state_interval) def _sync_local_state(self, context): diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 0aac967a09..e602ae04d1 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -202,11 +202,27 @@ def provisioning_error_handler(e, node, provision_state, 'tgt_prov_state': target_provision_state}) +def cleanup_cleanwait_timeout(task): + """Cleanup a cleaning task after timeout. + + :param task: a TaskManager instance. + """ + last_error = (_("Timeout reached while cleaning the node. Please " + "check if the ramdisk responsible for the cleaning is " + "running on the node. Failed on step %(step)s.") % + {'step': task.node.clean_step}) + cleaning_error_handler(task, msg=last_error, + set_fail_state=True) + + def cleaning_error_handler(task, msg, tear_down_cleaning=True, set_fail_state=True): """Put a failed node in CLEANFAIL and maintenance.""" node = task.node - if node.provision_state in (states.CLEANING, states.CLEANWAIT): + if node.provision_state in ( + states.CLEANING, + states.CLEANWAIT, + states.CLEANFAIL): # Clear clean step, msg should already include current step node.clean_step = {} info = node.driver_internal_info diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index e02523e2f9..954400e6ee 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -1100,7 +1100,13 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin, self.context, driver='fake', provision_state=states.CLEANWAIT, target_provision_state=tgt_prov_state, - provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0)) + provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0), + clean_step={ + 'interface': 'deploy', + 'step': 'erase_devices'}, + driver_internal_info={ + 'cleaning_reboot': manual, + 'clean_step_index': 0}) self.service._check_cleanwait_timeouts(self.context) self._stop_service() @@ -1108,6 +1114,11 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin, self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) self.assertIsNotNone(node.last_error) + # Test that cleaning parameters have been purged in order + # to prevent looping of the cleaning sequence + self.assertEqual({}, node.clean_step) + self.assertNotIn('clean_step_index', node.driver_internal_info) + self.assertNotIn('cleaning_reboot', node.driver_internal_info) def test__check_cleanwait_timeouts_automated_clean(self): self._check_cleanwait_timeouts() diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index a04a1c8f48..35d384a324 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -572,6 +572,37 @@ class ErrorHandlersTestCase(tests_base.TestCase): self.assertFalse(self.node.save.called) self.assertFalse(log_mock.warning.called) + @mock.patch.object(conductor_utils, 'cleaning_error_handler') + def test_cleanup_cleanwait_timeout_handler_call(self, mock_error_handler): + self.node.clean_step = {} + conductor_utils.cleanup_cleanwait_timeout(self.task) + + mock_error_handler.assert_called_once_with( + self.task, + msg="Timeout reached while cleaning the node. Please " + "check if the ramdisk responsible for the cleaning is " + "running on the node. Failed on step {}.", + set_fail_state=True) + + def test_cleanup_cleanwait_timeout(self): + self.node.provision_state = states.CLEANFAIL + target = 'baz' + self.node.target_provision_state = target + self.node.driver_internal_info = {} + self.node.clean_step = {'key': 'val'} + clean_error = ("Timeout reached while cleaning the node. Please " + "check if the ramdisk responsible for the cleaning is " + "running on the node. Failed on step {'key': 'val'}.") + self.node.driver_internal_info = { + 'cleaning_reboot': True, + 'clean_step_index': 0} + conductor_utils.cleanup_cleanwait_timeout(self.task) + self.assertEqual({}, self.node.clean_step) + self.assertNotIn('clean_step_index', self.node.driver_internal_info) + self.task.process_event.assert_called_once() + self.assertTrue(self.node.maintenance) + self.assertEqual(clean_error, self.node.maintenance_reason) + def test_cleaning_error_handler(self): self.node.provision_state = states.CLEANING target = 'baz' diff --git a/releasenotes/notes/cleaning-retry-fix-89a5d0e65920a064.yaml b/releasenotes/notes/cleaning-retry-fix-89a5d0e65920a064.yaml new file mode 100644 index 0000000000..73b7e9253b --- /dev/null +++ b/releasenotes/notes/cleaning-retry-fix-89a5d0e65920a064.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - A bug has been corrected where a node's current clean_step + was not purged upon that node timing out from a CLEANWAIT state. + Previously, this bug would prevent a user from retrying cleaning + operations. For more information, + see https://bugs.launchpad.net/ironic/+bug/1590146.