diff --git a/ironic/common/exception.py b/ironic/common/exception.py index 912121a967..c2e5030e87 100644 --- a/ironic/common/exception.py +++ b/ironic/common/exception.py @@ -813,3 +813,8 @@ class UnknownAttribute(ClientSideError): else: self.fieldname = name super(UnknownAttribute, self).__init__(self.msg) + + +class AgentInProgress(IronicException): + _msg_fmt = _('Node %(node)s command "%(command)s" failed. Agent is ' + 'presently executing a command. Error %(error)s') diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py index c279e2fcdc..269d630042 100644 --- a/ironic/conductor/cleaning.py +++ b/ironic/conductor/cleaning.py @@ -179,6 +179,17 @@ def do_next_clean_step(task, step_index): else None) task.process_event('wait', target_state=target_state) return + if isinstance(e, exception.AgentInProgress): + LOG.info('Conductor attempted to process clean step for ' + 'node %(node)s. Agent indicated it is presently ' + 'executing a command. Error: %(error)s', + {'node': task.node.uuid, + 'error': e}) + driver_internal_info['skip_current_clean_step'] = False + node.driver_internal_info = driver_internal_info + target_state = states.MANAGEABLE if manual_clean else None + task.process_event('wait', target_state=target_state) + return msg = (_('Node %(node)s failed step %(step)s: ' '%(exc)s') % diff --git a/ironic/conductor/deployments.py b/ironic/conductor/deployments.py index 8758ae08d9..d6ab54aead 100644 --- a/ironic/conductor/deployments.py +++ b/ironic/conductor/deployments.py @@ -257,6 +257,16 @@ def do_next_deploy_step(task, step_index): node.driver_internal_info = driver_internal_info task.process_event('wait') return + if isinstance(e, exception.AgentInProgress): + LOG.info('Conductor attempted to process deploy step for ' + 'node %(node)s. Agent indicated it is presently ' + 'executing a command. Error: %(error)s', + {'node': task.node.uuid, + 'error': e}) + driver_internal_info['skip_current_deploy_step'] = False + node.driver_internal_info = driver_internal_info + task.process_event('wait') + return log_msg = ('Node %(node)s failed deploy step %(step)s. Error: ' '%(err)s' % {'node': node.uuid, 'step': node.deploy_step, 'err': e}) diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py index 43d4bb18b5..4efb1c19bf 100644 --- a/ironic/drivers/modules/agent_base.py +++ b/ironic/drivers/modules/agent_base.py @@ -898,11 +898,11 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin): call = getattr(self._client, 'get_%s_steps' % step_type) try: agent_result = call(node, task.ports).get('command_result', {}) - except exception.AgentAPIError as exc: - if 'agent is busy' in str(exc): - LOG.debug('Agent is busy with a command, will refresh steps ' - 'on the next heartbeat') - return + except exception.AgentInProgress as exc: + LOG.debug('Agent for node %(node)s is busy with a command, ' + 'will refresh steps on the next heartbeat.', + {'node': task.node.uuid}) + return # TODO(dtantsur): change to just 'raise' if step_type == 'clean': diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index a6f9083c3f..f0c411c76b 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -153,6 +153,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command result from agent, see get_commands_status for a sample. """ @@ -223,6 +225,17 @@ class AgentClient(object): 'Expected 2xx HTTP status code, got %(code)d.', {'method': method, 'node': node.uuid, 'code': response.status_code}) + if (response.status_code == http_client.CONFLICT + or 'agent is busy' in faultstring.lower()): + # HTTP 409 check as an explicit check of if the agent + # is already busy. + # NOTE(TheJulia): The agent sends upper case A as of + # late victoria, but lower case the entire message + # for compatability with pre-late victoria agents + # which returns HTTP 409. + raise exception.AgentInProgress(node=node.uuid, + command=method, + error=faultstring) raise exception.AgentAPIError(node=node.uuid, status=response.status_code, error=faultstring) @@ -337,6 +350,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command status from agent. See :func:`get_commands_status` for a command result sample. """ @@ -370,6 +385,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command response from agent. See :func:`get_commands_status` for a command result sample. """ @@ -400,6 +417,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command response from agent. See :func:`get_commands_status` for a command result sample. """ @@ -456,6 +475,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command response from agent. See :func:`get_commands_status` for a command result sample. The value of key command_result is in the form of: @@ -487,6 +508,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command response from agent. See :func:`get_commands_status` for a command result sample. The value of key command_result is in the form of: @@ -550,6 +573,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command response from agent. See :func:`get_commands_status` for a command result sample. The value of key command_result is in the form of: @@ -581,6 +606,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command response from agent. """ @@ -597,6 +624,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command response from agent. See :func:`get_commands_status` for a command result sample. """ @@ -612,6 +641,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command response from agent. See :func:`get_commands_status` for a command result sample. """ @@ -627,6 +658,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command response from agent. See :func:`get_commands_status` for a command result sample. """ @@ -643,6 +676,8 @@ class AgentClient(object): :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :returns: A dict containing command response from agent. See :func:`get_commands_status` for a command result sample. """ @@ -660,6 +695,8 @@ class AgentClient(object): to issue the request, or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. + :raises: AgentInProgress when the command fails to execute as the agent + is presently executing the prior command. :raises: InstanceRescueFailure when the agent ramdisk is too old to support transmission of the rescue password. :returns: A dict containing command response from agent. diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 0bcb1c77f5..8398b38e94 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -770,6 +770,37 @@ class DoNodeCleanTestCase(db_base.DbTestCase): mock_execute.assert_called_once_with( mock.ANY, mock.ANY, self.clean_steps[0]) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step', + autospec=True) + def test_do_next_clean_step_agent_busy(self, mock_execute): + # When a clean step fails, go to CLEANWAIT + tgt_prov_state = states.MANAGEABLE + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.CLEANING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'clean_steps': self.clean_steps, + 'clean_step_index': None, + 'cleaning_reboot': True}, + clean_step={}) + mock_execute.side_effect = exception.AgentInProgress( + reason='still meowing') + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + cleaning.do_next_clean_step(task, 0) + + node.refresh() + # Make sure we go to CLEANWAIT + self.assertEqual(states.CLEANWAIT, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual(self.clean_steps[0], node.clean_step) + self.assertEqual(0, node.driver_internal_info['clean_step_index']) + self.assertFalse(node.driver_internal_info['skip_current_clean_step']) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.clean_steps[0]) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step', autospec=True) def test_do_next_clean_step_oob_reboot_last_step(self, mock_execute): diff --git a/ironic/tests/unit/conductor/test_deployments.py b/ironic/tests/unit/conductor/test_deployments.py index 6cff78acb9..56ec170728 100644 --- a/ironic/tests/unit/conductor/test_deployments.py +++ b/ironic/tests/unit/conductor/test_deployments.py @@ -769,7 +769,42 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin, self._stop_service() node.refresh() - # Make sure we go to CLEANWAIT + # Make sure we go to DEPLOYWAIT + self.assertEqual(states.DEPLOYWAIT, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual(self.deploy_steps[0], node.deploy_step) + self.assertEqual(0, node.driver_internal_info['deploy_step_index']) + self.assertFalse(node.driver_internal_info['skip_current_deploy_step']) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.deploy_steps[0]) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', + autospec=True) + def test_do_next_deploy_step_agent_busy(self, mock_execute): + # When a deploy step fails, go to DEPLOYWAIT + tgt_prov_state = states.ACTIVE + + self._start_service() + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.DEPLOYING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'deploy_steps': self.deploy_steps, + 'deploy_step_index': None, + 'deployment_reboot': True}, + clean_step={}) + mock_execute.side_effect = exception.AgentInProgress( + reason='meow') + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + deployments.do_next_deploy_step(task, 0) + + self._stop_service() + node.refresh() + + # Make sure we go to DEPLOYWAIT self.assertEqual(states.DEPLOYWAIT, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) self.assertEqual(self.deploy_steps[0], node.deploy_step) diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py index 97abaf1d98..73a19c7e34 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base.py @@ -2223,8 +2223,8 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest): @mock.patch.object(agent_client.AgentClient, 'get_deploy_steps', autospec=True) def test_refresh_steps_busy(self, client_mock, log_mock): - client_mock.side_effect = exception.AgentAPIError( - node="node", status="500", error='agent is busy') + client_mock.side_effect = exception.AgentInProgress( + node="node", error='Agent is busy : maximum meowing') with task_manager.acquire( self.context, self.node.uuid, shared=False) as task: diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py index e5463e7c11..0823d75a81 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_client.py +++ b/ironic/tests/unit/drivers/modules/test_agent_client.py @@ -237,6 +237,51 @@ class TestAgentClient(base.TestCase): timeout=60, verify=True) + def test__command_error_code_agent_busy(self): + # Victoria and previous busy status check. + response_text = {"faultstring": "Agent is busy - meowing"} + self.client.session.post.return_value = MockResponse( + response_text, status_code=http_client.BAD_REQUEST) + method = 'standby.run_image' + image_info = {'image_id': 'test_image'} + params = {'image_info': image_info} + + url = self.client._get_command_url(self.node) + body = self.client._get_command_body(method, params) + + self.assertRaises(exception.AgentInProgress, + self.client._command, + self.node, method, params) + self.client.session.post.assert_called_once_with( + url, + data=body, + params={'wait': 'false'}, + timeout=60, + verify=True) + + def test__command_error_code_agent_busy_conflict(self): + # Post Wallaby logic as the IPA return code changed to + # better delineate the state. + response_text = {"faultstring": "lolcat says busy meowing"} + self.client.session.post.return_value = MockResponse( + response_text, status_code=http_client.CONFLICT) + method = 'standby.run_image' + image_info = {'image_id': 'test_image'} + params = {'image_info': image_info} + + url = self.client._get_command_url(self.node) + body = self.client._get_command_body(method, params) + + self.assertRaises(exception.AgentInProgress, + self.client._command, + self.node, method, params) + self.client.session.post.assert_called_once_with( + url, + data=body, + params={'wait': 'false'}, + timeout=60, + verify=True) + @mock.patch('os.path.exists', autospec=True, return_value=True) def test__command_verify(self, mock_exists): response_data = {'status': 'ok'} diff --git a/releasenotes/notes/fix-busy-agent-check-3cf75242b4783009.yaml b/releasenotes/notes/fix-busy-agent-check-3cf75242b4783009.yaml new file mode 100644 index 0000000000..df490b1788 --- /dev/null +++ b/releasenotes/notes/fix-busy-agent-check-3cf75242b4783009.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixes recognition of a busy agent to also handle recognition + during deployment steps by more uniformly detecting and + identifying when the ``ironic-python-agent`` service is busy.