diff --git a/ironic/common/exception.py b/ironic/common/exception.py index d5722e36d2..62a685da43 100644 --- a/ironic/common/exception.py +++ b/ironic/common/exception.py @@ -791,3 +791,7 @@ class BIOSSettingListNotFound(NotFound): class DatabaseVersionTooOld(IronicException): _msg_fmt = _("Database version is too old") + + +class AgentConnectionFailed(IronicException): + _msg_fmt = _("Connection to agent failed: %(reason)s") diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 8582e8fbb7..2ee2bd0310 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -1311,6 +1311,19 @@ class ConductorManager(base_manager.BaseConductorManager): try: result = interface.execute_clean_step(task, step) except Exception as e: + if isinstance(e, exception.AgentConnectionFailed): + if task.node.driver_internal_info.get('cleaning_reboot'): + LOG.info('Agent is not yet running on node %(node)s ' + 'after cleaning reboot, waiting for agent to ' + 'come up to run next clean step %(step)s.', + {'node': node.uuid, 'step': step}) + driver_internal_info['skip_current_clean_step'] = False + node.driver_internal_info = driver_internal_info + target_state = (states.MANAGEABLE if manual_clean + else None) + task.process_event('wait', target_state=target_state) + return + msg = (_('Node %(node)s failed step %(step)s: ' '%(exc)s') % {'node': node.uuid, 'exc': e, @@ -1345,6 +1358,7 @@ class ConductorManager(base_manager.BaseConductorManager): driver_internal_info = node.driver_internal_info driver_internal_info['clean_steps'] = None driver_internal_info.pop('clean_step_index', None) + driver_internal_info.pop('cleaning_reboot', None) node.driver_internal_info = driver_internal_info node.save() try: diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index a332a77afb..71a56c90d9 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -466,6 +466,7 @@ class AgentDeployMixin(HeartbeatMixin): info = task.node.driver_internal_info info.pop('cleaning_reboot', None) task.node.driver_internal_info = info + task.node.save() manager_utils.notify_conductor_resume_clean(task) return else: diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index 982980bfdd..af9b90323c 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -81,6 +81,12 @@ class AgentClient(object): try: response = self.session.post(url, params=request_params, data=body) + except requests.ConnectionError as e: + msg = (_('Failed to invoke agent command %(method)s for node ' + '%(node)s. Error: %(error)s') % + {'method': method, 'node': node.uuid, 'error': e}) + LOG.error(msg) + raise exception.AgentConnectionFailed(reason=msg) except requests.RequestException as e: msg = (_('Error invoking agent command %(method)s for node ' '%(node)s. Error: %(error)s') % diff --git a/ironic/drivers/modules/drac/raid.py b/ironic/drivers/modules/drac/raid.py index 2c424951ed..24d44962dd 100644 --- a/ironic/drivers/modules/drac/raid.py +++ b/ironic/drivers/modules/drac/raid.py @@ -896,4 +896,8 @@ class DracRAID(base.RAIDInterface): def _resume_cleaning(self, task): raid_common.update_raid_info( task.node, self.get_logical_disks(task)) + driver_internal_info = task.node.driver_internal_info + driver_internal_info['cleaning_reboot'] = True + task.node.driver_internal_info = driver_internal_info + task.node.save() manager_utils.notify_conductor_resume_clean(task) diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 9b9dd72c65..80bd7a4664 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -3665,6 +3665,114 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): def test__do_next_clean_step_manual_execute_fail(self): self._do_next_clean_step_execute_fail(manual=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step', + autospec=True) + def test_do_next_clean_step_oob_reboot(self, mock_execute): + # When a clean step fails, go to CLEANWAIT + tgt_prov_state = states.MANAGEABLE + + self._start_service() + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.CLEANING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'clean_steps': self.clean_steps, + 'clean_step_index': None, + 'cleaning_reboot': True}, + clean_step={}) + mock_execute.side_effect = exception.AgentConnectionFailed( + reason='failed') + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + self.service._do_next_clean_step(task, 0) + + self._stop_service() + node.refresh() + + # Make sure we go to CLEANWAIT + self.assertEqual(states.CLEANWAIT, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual(self.clean_steps[0], node.clean_step) + self.assertEqual(0, node.driver_internal_info['clean_step_index']) + self.assertFalse(node.driver_internal_info['skip_current_clean_step']) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.clean_steps[0]) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step', + autospec=True) + def test_do_next_clean_step_oob_reboot_last_step(self, mock_execute): + # Resume where last_step is the last cleaning step + tgt_prov_state = states.MANAGEABLE + info = {'clean_steps': self.clean_steps, + 'cleaning_reboot': True, + 'clean_step_index': len(self.clean_steps) - 1} + + self._start_service() + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.CLEANING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info=info, + clean_step=self.clean_steps[-1]) + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + self.service._do_next_clean_step(task, None) + + self._stop_service() + node.refresh() + + # Cleaning should be complete without calling additional steps + self.assertEqual(tgt_prov_state, node.provision_state) + self.assertEqual(states.NOSTATE, node.target_provision_state) + self.assertEqual({}, node.clean_step) + self.assertNotIn('clean_step_index', node.driver_internal_info) + self.assertNotIn('cleaning_reboot', node.driver_internal_info) + self.assertIsNone(node.driver_internal_info['clean_steps']) + self.assertFalse(mock_execute.called) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step', + autospec=True) + @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True) + def test_do_next_clean_step_oob_reboot_fail(self, tear_mock, + mock_execute): + # When a clean step fails with no reboot requested go to CLEANFAIL + tgt_prov_state = states.MANAGEABLE + + self._start_service() + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.CLEANING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'clean_steps': self.clean_steps, + 'clean_step_index': None}, + clean_step={}) + mock_execute.side_effect = exception.AgentConnectionFailed( + reason='failed') + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + self.service._do_next_clean_step(task, 0) + tear_mock.assert_called_once_with(task.driver.deploy, task) + + self._stop_service() + node.refresh() + + # Make sure we go to CLEANFAIL, clear clean_steps + self.assertEqual(states.CLEANFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual({}, node.clean_step) + self.assertNotIn('clean_step_index', node.driver_internal_info) + self.assertNotIn('skip_current_clean_step', node.driver_internal_info) + self.assertIsNotNone(node.last_error) + self.assertTrue(node.maintenance) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.clean_steps[0]) + @mock.patch.object(manager, 'LOG', autospec=True) @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step', autospec=True) diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py index f8396b3cee..6ddc7d2fd0 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_client.py +++ b/ironic/tests/unit/drivers/modules/test_agent_client.py @@ -133,6 +133,24 @@ class TestAgentClient(base.TestCase): {'method': method, 'node': self.node.uuid, 'error': error}, str(e)) + def test__command_fail_connect(self): + error = 'Boom' + self.client.session.post.side_effect = requests.ConnectionError(error) + method = 'foo.bar' + params = {} + + self.client._get_command_url(self.node) + self.client._get_command_body(method, params) + + e = self.assertRaises(exception.AgentConnectionFailed, + self.client._command, + self.node, method, params) + self.assertEqual('Connection to agent failed: Failed to invoke ' + 'agent command %(method)s for node %(node)s. ' + 'Error: %(error)s' % + {'method': method, 'node': self.node.uuid, + 'error': error}, str(e)) + def test__command_error_code(self): response_text = '{"faultstring": "you dun goofd"}' self.client.session.post.return_value = MockResponse( diff --git a/releasenotes/notes/resume-cleaning-post-oob-reboot-b76c23f98219a8d2.yaml b/releasenotes/notes/resume-cleaning-post-oob-reboot-b76c23f98219a8d2.yaml new file mode 100644 index 0000000000..4b1e36a9cf --- /dev/null +++ b/releasenotes/notes/resume-cleaning-post-oob-reboot-b76c23f98219a8d2.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + The cleaning operation may fail, if an in-band clean step were to + execute after the completion of out-of-band clean step that + performs reboot of the node. The failure is caused because of race + condition where in cleaning is resumed before the Ironic Python + Agent(IPA) is ready to execute clean steps. This has been fixed. + For more information, see `bug 2002731 + `_.