diff --git a/ironic/common/async_steps.py b/ironic/common/async_steps.py index 8bec847627..d21fc42801 100644 --- a/ironic/common/async_steps.py +++ b/ironic/common/async_steps.py @@ -106,7 +106,8 @@ def set_node_flags(node, reboot=None, skip_current_step=None, polling=None, :param polling: Boolean value to set for node's driver_internal_info flag deployment_polling, servicing_polling or cleaning_polling. If it is None, the corresponding polling flag is not set in the node's - driver_internal_info. + driver_internal_info. A polling flag is otherwise deleted from + the node's driver_internal_info. :param step_type: The type of steps to process: 'clean', 'service' or 'deploy'. If None, detected from the node. """ @@ -134,6 +135,9 @@ def set_node_flags(node, reboot=None, skip_current_step=None, polling=None, node.set_driver_internal_info(skip_field, skip_current_step) if polling is not None: node.set_driver_internal_info(polling_field, polling) + else: + # Always unset the value *if* previously set. + node.del_driver_internal_info(polling_field) node.save() diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py index 044b64baf6..9c8b052805 100644 --- a/ironic/conductor/cleaning.py +++ b/ironic/conductor/cleaning.py @@ -175,6 +175,11 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None): eocn = step.get('execute_on_child_nodes', False) result = None try: + if async_steps.CLEANING_POLLING in node.driver_internal_info: + # We're executing a new step, so a prior polling + # flag should be dictated by the new step, *not* a + # prior step. + node.del_driver_internal_info(async_steps.CLEANING_POLLING) if not eocn: LOG.info('Executing %(step)s on node %(node)s', {'step': step, 'node': node.uuid}) diff --git a/ironic/conductor/deployments.py b/ironic/conductor/deployments.py index 06d7ffa308..6a4dbcf074 100644 --- a/ironic/conductor/deployments.py +++ b/ironic/conductor/deployments.py @@ -340,6 +340,10 @@ def do_next_deploy_step(task, step_index): child_node_execution = step.get('execute_on_child_nodes', False) result = None try: + if async_steps.DEPLOYMENT_POLLING in node.driver_internal_info: + # We're going to execute a new step, we should delete any + # older/out of date option. + node.del_driver_internal_info(async_steps.DEPLOYMENT_POLLING) if not child_node_execution: interface = getattr(task.driver, step.get('interface')) LOG.info('Executing %(step)s on node %(node)s', diff --git a/ironic/conductor/servicing.py b/ironic/conductor/servicing.py index 3264643926..8b385d7c7d 100644 --- a/ironic/conductor/servicing.py +++ b/ironic/conductor/servicing.py @@ -133,6 +133,10 @@ def do_next_service_step(task, step_index, disable_ramdisk=None): eocn = step.get('execute_on_child_nodes', False) result = None try: + if async_steps.SERVICING_POLLING in node.driver_internal_info: + # We're going to execute a new step, we should delete any + # older/out of date option. + node.del_driver_internal_info(async_steps.SERVICING_POLLING) if not eocn: LOG.info('Executing %(step)s on node %(node)s', {'step': step, 'node': node.uuid}) diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 1906ce430b..6f17585706 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -607,7 +607,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): target_provision_state=tgt_prov_state, last_error=None, driver_internal_info={'clean_steps': self.clean_steps, - 'clean_step_index': 0}, + 'clean_step_index': 0, + 'cleaning_polling': True}, clean_step=self.clean_steps[0]) mock_execute.return_value = return_state @@ -623,6 +624,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertEqual(1, node.driver_internal_info['clean_step_index']) mock_execute.assert_called_once_with( mock.ANY, mock.ANY, self.clean_steps[1]) + self.assertNotIn('cleaning_polling', node.driver_internal_info) def test_do_next_clean_step_continue_from_last_cleaning(self): self._do_next_clean_step_continue_from_last_cleaning(states.CLEANWAIT) diff --git a/ironic/tests/unit/conductor/test_deployments.py b/ironic/tests/unit/conductor/test_deployments.py index c96a6fae1a..a9a6e493b8 100644 --- a/ironic/tests/unit/conductor/test_deployments.py +++ b/ironic/tests/unit/conductor/test_deployments.py @@ -667,7 +667,8 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin, autospec=True) def test__do_next_deploy_step_in_deploywait(self, mock_execute): driver_internal_info = {'deploy_step_index': None, - 'deploy_steps': self.deploy_steps} + 'deploy_steps': self.deploy_steps, + 'deployment_polling': True} self._start_service() node = obj_utils.create_test_node( self.context, driver='fake-hardware', @@ -692,6 +693,7 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin, self.assertEqual(states.ACTIVE, node.target_provision_state) self.assertEqual(expected_first_step, node.deploy_step) self.assertEqual(0, node.driver_internal_info['deploy_step_index']) + self.assertNotIn('deployment_polling', node.driver_internal_info) mock_execute.assert_called_once_with(mock.ANY, task, self.deploy_steps[0]) diff --git a/ironic/tests/unit/conductor/test_servicing.py b/ironic/tests/unit/conductor/test_servicing.py index 81e0d5d7e5..22c2612d96 100644 --- a/ironic/tests/unit/conductor/test_servicing.py +++ b/ironic/tests/unit/conductor/test_servicing.py @@ -374,7 +374,8 @@ class DoNodeServiceTestCase(db_base.DbTestCase): target_provision_state=tgt_prov_state, last_error=None, driver_internal_info={'service_steps': self.service_steps, - 'service_step_index': 0}, + 'service_step_index': 0, + 'servicing_polling': True}, service_step=self.service_steps[0]) mock_execute.return_value = return_state @@ -390,6 +391,7 @@ class DoNodeServiceTestCase(db_base.DbTestCase): self.assertEqual(1, node.driver_internal_info['service_step_index']) mock_execute.assert_called_once_with( mock.ANY, mock.ANY, self.service_steps[1]) + self.assertNotIn('servicing_polling', node.driver_internal_info) def test_do_next_clean_step_continue_from_last_cleaning(self): self._do_next_clean_step_continue_from_last_cleaning( diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py index c8940ff5ec..c9f6854581 100644 --- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py +++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py @@ -3307,3 +3307,15 @@ class AsyncStepTestCase(db_base.DbTestCase): skip_current_step=True, polling=True) self.assertEqual(expected, self.node.driver_internal_info) + + def test_set_async_step_flags_clears_polling_if_not_set(self): + self.node.clean_step = {'step': 'create_configuration', + 'interface': 'raid'} + self.node.driver_internal_info = {'agent_secret_token': 'test', + 'cleaning_polling': True} + expected = {'cleaning_reboot': True, + 'skip_current_clean_step': True} + self.node.save() + utils.set_async_step_flags(self.node, reboot=True, + skip_current_step=True) + self.assertEqual(expected, self.node.driver_internal_info) diff --git a/releasenotes/notes/fix-polling-lockout-for-steps-b9645f0cae18da1e.yaml b/releasenotes/notes/fix-polling-lockout-for-steps-b9645f0cae18da1e.yaml new file mode 100644 index 0000000000..d5e22db4e6 --- /dev/null +++ b/releasenotes/notes/fix-polling-lockout-for-steps-b9645f0cae18da1e.yaml @@ -0,0 +1,13 @@ +--- +fixes: + - | + Fixes an issue where operators executing complex arrangement of steps + which include out-of-band and in-band steps, for example a hardware + RAID ``create_configuration`` step followed by in-band steps inside of + the agent, would effectively get the agent stuck in a ``wait`` state in + the Cleaning, Servicing, or Deploying workflows. + This was related to the way out-of-band steps are executed and monitored. + Ironic, before starting to execute a new step, now cleans the polling + lockout flag for the respective workflow being executed to prevent the + agent from getting stuck. For more information, please see + `bug 2096938 `_. diff --git a/tox.ini b/tox.ini index 1048b3b0a3..7a85d9312f 100644 --- a/tox.ini +++ b/tox.ini @@ -165,7 +165,7 @@ filename = *.py,app.wsgi exclude=.*,dist,doc,*lib/python*,*egg,build import-order-style = pep8 application-import-names = ironic -max-complexity=19 +max-complexity=20 # [H106] Don't put vim configuration in source files. # [H203] Use assertIs(Not)None to check for None. # [H204] Use assert(Not)Equal to check for equality.