From c9a5d1b153cc707279755bcd64bcb841e7784ec8 Mon Sep 17 00:00:00 2001 From: Pavlo Shchelokovskyy Date: Tue, 29 Nov 2016 13:54:27 +0200 Subject: [PATCH] Move heartbeat processing to separate mixin class Ironic already exposes heartbeat endpoint in top-level, driver-independent REST API. As an Ironic API heartbeater can be implemented as much simpler script/startup job than whole Ironic Python Agent inside the deploy ramdisk, let us allow out-of-tree drivers which are not IPA-based to re-use base ironic heartbeat processing implementation as is. This patch moves the 'heartbeat' method to a separate HeartbeatMixin class, and also provides dummy implementations of methods called from within the 'heartbeat' method for reference. Drivers that do not depend on IPA but still would like to use heartbeat functionality may inherit from this class and override self._client in their __init__ to None or False to suppress collecting logs from ramdisk on failure. As a side-effect of such change, method `_refresh_clean_steps` is turned to public one (leading underscore removed). This change should be completely trasparent to current consumers of ironic driver API and has no effect on them. Change-Id: Ia7987342aef1910c5f4042604ab5076bf4e035e9 --- ironic/drivers/modules/agent_base_vendor.py | 183 ++++++++++-------- .../drivers/modules/test_agent_base_vendor.py | 60 +++--- 2 files changed, 135 insertions(+), 108 deletions(-) diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index 33acd031f0..13cc94b8b4 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -206,8 +206,8 @@ def log_and_raise_deployment_error(task, msg): raise exception.InstanceDeployFailure(msg) -class AgentDeployMixin(object): - """Mixin with deploy methods.""" +class HeartbeatMixin(object): + """Mixin class implementing heartbeat processing.""" def __init__(self): self._client = _get_client() @@ -240,8 +240,104 @@ class AgentDeployMixin(object): """ - @METRICS.timer('AgentDeployMixin._refresh_clean_steps') - def _refresh_clean_steps(self, task): + def refresh_clean_steps(self, task): + """Refresh the node's cached clean steps + + :param task: a TaskManager instance + + """ + + def continue_cleaning(self, task): + """Start the next cleaning step if the previous one is complete. + + :param task: a TaskManager instance + + """ + + @METRICS.timer('HeartbeatMixin.heartbeat') + def heartbeat(self, task, callback_url): + """Process a heartbeat. + + :param task: task to work with. + :param callback_url: agent HTTP API URL. + """ + # TODO(dtantsur): upgrade lock only if we actually take action other + # than updating the last timestamp. + task.upgrade_lock() + + node = task.node + LOG.debug('Heartbeat from node %s', node.uuid) + + driver_internal_info = node.driver_internal_info + driver_internal_info['agent_url'] = callback_url + + # TODO(rloo): 'agent_last_heartbeat' was deprecated since it wasn't + # being used so remove that entry if it exists. + # Hopefully all nodes will have been updated by Pike, so + # we can delete this code then. + driver_internal_info.pop('agent_last_heartbeat', None) + + node.driver_internal_info = driver_internal_info + node.save() + + # Async call backs don't set error state on their own + # TODO(jimrollenhagen) improve error messages here + msg = _('Failed checking if deploy is done.') + try: + if node.maintenance: + # this shouldn't happen often, but skip the rest if it does. + LOG.debug('Heartbeat from node %(node)s in maintenance mode; ' + 'not taking any action.', {'node': node.uuid}) + return + elif (node.provision_state == states.DEPLOYWAIT and + not self.deploy_has_started(task)): + msg = _('Node failed to deploy.') + self.continue_deploy(task) + elif (node.provision_state == states.DEPLOYWAIT and + self.deploy_is_done(task)): + msg = _('Node failed to move to active state.') + self.reboot_to_instance(task) + elif (node.provision_state == states.DEPLOYWAIT and + self.deploy_has_started(task)): + node.touch_provisioning() + elif node.provision_state == states.CLEANWAIT: + node.touch_provisioning() + try: + if not node.clean_step: + LOG.debug('Node %s just booted to start cleaning.', + node.uuid) + msg = _('Node failed to start the first cleaning ' + 'step.') + # First, cache the clean steps + self.refresh_clean_steps(task) + # Then set/verify node clean steps and start cleaning + manager_utils.set_node_cleaning_steps(task) + _notify_conductor_resume_clean(task) + else: + msg = _('Node failed to check cleaning progress.') + self.continue_cleaning(task) + except exception.NoFreeConductorWorker: + # waiting for the next heartbeat, node.last_error and + # logging message is filled already via conductor's hook + pass + + except Exception as e: + err_info = {'node': node.uuid, 'msg': msg, 'e': e} + last_error = _('Asynchronous exception for node %(node)s: ' + '%(msg)s Exception: %(e)s') % err_info + LOG.exception(last_error) + if node.provision_state in (states.CLEANING, states.CLEANWAIT): + manager_utils.cleaning_error_handler(task, last_error) + elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT): + deploy_utils.set_failed_state( + task, last_error, collect_logs=bool(self._client)) + + +class AgentDeployMixin(HeartbeatMixin): + """Mixin with deploy methods.""" + + @METRICS.timer('AgentDeployMixin.refresh_clean_steps') + def refresh_clean_steps(self, task): """Refresh the node's cached clean steps from the booted agent. Gets the node's clean steps from the booted agent and caches them. @@ -353,7 +449,7 @@ class AgentDeployMixin(object): elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH': # Cache the new clean steps (and 'hardware_manager_version') try: - self._refresh_clean_steps(task) + self.refresh_clean_steps(task) except exception.NodeCleaningFailure as e: msg = (_('Could not continue cleaning on node ' '%(node)s: %(err)s.') % @@ -431,83 +527,6 @@ class AgentDeployMixin(object): LOG.error(msg) return manager_utils.cleaning_error_handler(task, msg) - @METRICS.timer('AgentDeployMixin.heartbeat') - def heartbeat(self, task, callback_url): - """Process a heartbeat. - - :param task: task to work with. - :param callback_url: agent HTTP API URL. - """ - # TODO(dtantsur): upgrade lock only if we actually take action other - # than updating the last timestamp. - task.upgrade_lock() - - node = task.node - LOG.debug('Heartbeat from node %s', node.uuid) - - driver_internal_info = node.driver_internal_info - driver_internal_info['agent_url'] = callback_url - - # TODO(rloo): 'agent_last_heartbeat' was deprecated since it wasn't - # being used so remove that entry if it exists. - # Hopefully all nodes will have been updated by Pike, so - # we can delete this code then. - driver_internal_info.pop('agent_last_heartbeat', None) - - node.driver_internal_info = driver_internal_info - node.save() - - # Async call backs don't set error state on their own - # TODO(jimrollenhagen) improve error messages here - msg = _('Failed checking if deploy is done.') - try: - if node.maintenance: - # this shouldn't happen often, but skip the rest if it does. - LOG.debug('Heartbeat from node %(node)s in maintenance mode; ' - 'not taking any action.', {'node': node.uuid}) - return - elif (node.provision_state == states.DEPLOYWAIT and - not self.deploy_has_started(task)): - msg = _('Node failed to deploy.') - self.continue_deploy(task) - elif (node.provision_state == states.DEPLOYWAIT and - self.deploy_is_done(task)): - msg = _('Node failed to move to active state.') - self.reboot_to_instance(task) - elif (node.provision_state == states.DEPLOYWAIT and - self.deploy_has_started(task)): - node.touch_provisioning() - elif node.provision_state == states.CLEANWAIT: - node.touch_provisioning() - try: - if not node.clean_step: - LOG.debug('Node %s just booted to start cleaning.', - node.uuid) - msg = _('Node failed to start the first cleaning ' - 'step.') - # First, cache the clean steps - self._refresh_clean_steps(task) - # Then set/verify node clean steps and start cleaning - manager_utils.set_node_cleaning_steps(task) - _notify_conductor_resume_clean(task) - else: - msg = _('Node failed to check cleaning progress.') - self.continue_cleaning(task) - except exception.NoFreeConductorWorker: - # waiting for the next heartbeat, node.last_error and - # logging message is filled already via conductor's hook - pass - - except Exception as e: - err_info = {'node': node.uuid, 'msg': msg, 'e': e} - last_error = _('Asynchronous exception for node %(node)s: ' - '%(msg)s Exception: %(e)s') % err_info - LOG.exception(last_error) - if node.provision_state in (states.CLEANING, states.CLEANWAIT): - manager_utils.cleaning_error_handler(task, last_error) - elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT): - deploy_utils.set_failed_state(task, last_error) - @METRICS.timer('AgentDeployMixin.reboot_and_finish_deploy') def reboot_and_finish_deploy(self, task): """Helper method to trigger reboot on the node and finish deploy. diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index a52c3c1620..bb016190b1 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -60,12 +60,16 @@ class AgentDeployMixinBaseTest(db_base.DbTestCase): self.node = object_utils.create_test_node(self.context, **n) -class TestHeartbeat(AgentDeployMixinBaseTest): +class HeartbeatMixinTest(AgentDeployMixinBaseTest): - @mock.patch.object(agent_base_vendor.AgentDeployMixin, + def setUp(self): + super(HeartbeatMixinTest, self).setUp() + self.deploy = agent_base_vendor.HeartbeatMixin() + + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'deploy_has_started', autospec=True) @mock.patch.object(deploy_utils, 'set_failed_state', autospec=True) - @mock.patch.object(agent_base_vendor.AgentDeployMixin, 'deploy_is_done', + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'deploy_is_done', autospec=True) @mock.patch.object(agent_base_vendor.LOG, 'exception', autospec=True) def test_heartbeat_deploy_done_fails(self, log_mock, done_mock, @@ -77,16 +81,17 @@ class TestHeartbeat(AgentDeployMixinBaseTest): task.node.provision_state = states.DEPLOYWAIT task.node.target_provision_state = states.ACTIVE self.deploy.heartbeat(task, 'http://127.0.0.1:8080') - failed_mock.assert_called_once_with(task, mock.ANY) + failed_mock.assert_called_once_with( + task, mock.ANY, collect_logs=True) log_mock.assert_called_once_with( 'Asynchronous exception for node ' '1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy ' 'is done. Exception: LlamaException') - @mock.patch.object(agent_base_vendor.AgentDeployMixin, + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'deploy_has_started', autospec=True) @mock.patch.object(deploy_utils, 'set_failed_state', autospec=True) - @mock.patch.object(agent_base_vendor.AgentDeployMixin, 'deploy_is_done', + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'deploy_is_done', autospec=True) @mock.patch.object(agent_base_vendor.LOG, 'exception', autospec=True) def test_heartbeat_deploy_done_raises_with_event(self, log_mock, done_mock, @@ -116,8 +121,8 @@ class TestHeartbeat(AgentDeployMixinBaseTest): 'is done. Exception: LlamaException') @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) - @mock.patch.object(agent_base_vendor.AgentDeployMixin, - '_refresh_clean_steps', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'refresh_clean_steps', autospec=True) @mock.patch.object(manager_utils, 'set_node_cleaning_steps', autospec=True) @mock.patch.object(agent_base_vendor, '_notify_conductor_resume_clean', autospec=True) @@ -137,8 +142,8 @@ class TestHeartbeat(AgentDeployMixinBaseTest): @mock.patch.object(manager_utils, 'cleaning_error_handler') @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) - @mock.patch.object(agent_base_vendor.AgentDeployMixin, - '_refresh_clean_steps', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'refresh_clean_steps', autospec=True) @mock.patch.object(manager_utils, 'set_node_cleaning_steps', autospec=True) @mock.patch.object(agent_base_vendor, '_notify_conductor_resume_clean', autospec=True) @@ -171,7 +176,7 @@ class TestHeartbeat(AgentDeployMixinBaseTest): failed_mock.side_effect = None @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) - @mock.patch.object(agent_base_vendor.AgentDeployMixin, + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_cleaning', autospec=True) def test_heartbeat_continue_cleaning(self, mock_continue, mock_touch): self.node.clean_step = { @@ -190,7 +195,7 @@ class TestHeartbeat(AgentDeployMixinBaseTest): mock_continue.assert_called_once_with(mock.ANY, task) @mock.patch.object(manager_utils, 'cleaning_error_handler') - @mock.patch.object(agent_base_vendor.AgentDeployMixin, + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_cleaning', autospec=True) def test_heartbeat_continue_cleaning_fails(self, mock_continue, mock_handler): @@ -213,7 +218,7 @@ class TestHeartbeat(AgentDeployMixinBaseTest): mock_handler.assert_called_once_with(task, mock.ANY) @mock.patch.object(manager_utils, 'cleaning_error_handler') - @mock.patch.object(agent_base_vendor.AgentDeployMixin, + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_cleaning', autospec=True) def test_heartbeat_continue_cleaning_no_worker(self, mock_continue, mock_handler): @@ -235,9 +240,9 @@ class TestHeartbeat(AgentDeployMixinBaseTest): mock_continue.assert_called_once_with(mock.ANY, task) self.assertFalse(mock_handler.called) - @mock.patch.object(agent_base_vendor.AgentDeployMixin, 'continue_deploy', + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', autospec=True) - @mock.patch.object(agent_base_vendor.AgentDeployMixin, + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'reboot_to_instance', autospec=True) @mock.patch.object(agent_base_vendor, '_notify_conductor_resume_clean', autospec=True) @@ -260,9 +265,9 @@ class TestHeartbeat(AgentDeployMixinBaseTest): node.refresh() self.assertNotIn('agent_last_heartbeat', node.driver_internal_info) - @mock.patch.object(agent_base_vendor.AgentDeployMixin, 'continue_deploy', + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', autospec=True) - @mock.patch.object(agent_base_vendor.AgentDeployMixin, + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'reboot_to_instance', autospec=True) @mock.patch.object(agent_base_vendor, '_notify_conductor_resume_clean', autospec=True) @@ -283,7 +288,7 @@ class TestHeartbeat(AgentDeployMixinBaseTest): self.assertEqual(0, cd_mock.call_count) @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) - @mock.patch.object(agent_base_vendor.AgentDeployMixin, + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'deploy_has_started', autospec=True) def test_heartbeat_touch_provisioning(self, mock_deploy_started, mock_touch): @@ -297,6 +302,9 @@ class TestHeartbeat(AgentDeployMixinBaseTest): mock_touch.assert_called_once_with(mock.ANY) + +class AgentDeployMixinTest(AgentDeployMixinBaseTest): + @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True) @mock.patch.object(time, 'sleep', lambda seconds: None) @mock.patch.object(manager_utils, 'node_power_action', autospec=True) @@ -968,7 +976,7 @@ class TestHeartbeat(AgentDeployMixinBaseTest): @mock.patch.object(agent_base_vendor, '_notify_conductor_resume_clean', autospec=True) @mock.patch.object(agent_base_vendor.AgentDeployMixin, - '_refresh_clean_steps', autospec=True) + 'refresh_clean_steps', autospec=True) @mock.patch.object(agent_client.AgentClient, 'get_commands_status', autospec=True) def _test_continue_cleaning_clean_version_mismatch( @@ -1007,7 +1015,7 @@ class TestHeartbeat(AgentDeployMixinBaseTest): @mock.patch.object(agent_base_vendor, '_notify_conductor_resume_clean', autospec=True) @mock.patch.object(agent_base_vendor.AgentDeployMixin, - '_refresh_clean_steps', autospec=True) + 'refresh_clean_steps', autospec=True) @mock.patch.object(agent_client.AgentClient, 'get_commands_status', autospec=True) def test_continue_cleaning_clean_version_mismatch_fail( @@ -1157,13 +1165,13 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest): @mock.patch.object(agent_client.AgentClient, 'get_clean_steps', autospec=True) - def test__refresh_clean_steps(self, client_mock): + def test_refresh_clean_steps(self, client_mock): client_mock.return_value = { 'command_result': self.clean_steps} with task_manager.acquire( self.context, self.node.uuid, shared=False) as task: - self.deploy._refresh_clean_steps(task) + self.deploy.refresh_clean_steps(task) client_mock.assert_called_once_with(mock.ANY, task.node, task.ports) @@ -1184,7 +1192,7 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest): @mock.patch.object(agent_client.AgentClient, 'get_clean_steps', autospec=True) - def test__refresh_clean_steps_missing_steps(self, client_mock): + def test_refresh_clean_steps_missing_steps(self, client_mock): del self.clean_steps['clean_steps'] client_mock.return_value = { 'command_result': self.clean_steps} @@ -1193,14 +1201,14 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest): self.context, self.node.uuid, shared=False) as task: self.assertRaisesRegex(exception.NodeCleaningFailure, 'invalid result', - self.deploy._refresh_clean_steps, + self.deploy.refresh_clean_steps, task) client_mock.assert_called_once_with(mock.ANY, task.node, task.ports) @mock.patch.object(agent_client.AgentClient, 'get_clean_steps', autospec=True) - def test__refresh_clean_steps_missing_interface(self, client_mock): + def test_refresh_clean_steps_missing_interface(self, client_mock): step = self.clean_steps['clean_steps']['SpecificHardwareManager'][1] del step['interface'] client_mock.return_value = { @@ -1210,7 +1218,7 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest): self.context, self.node.uuid, shared=False) as task: self.assertRaisesRegex(exception.NodeCleaningFailure, 'invalid clean step', - self.deploy._refresh_clean_steps, + self.deploy.refresh_clean_steps, task) client_mock.assert_called_once_with(mock.ANY, task.node, task.ports)