From 6c96257142c18adb9f070617022a6d6c1414ae1f Mon Sep 17 00:00:00 2001 From: Jay Faulkner Date: Wed, 23 Mar 2016 16:00:37 +0000 Subject: [PATCH] Config to stop powering off nodes on failure This adds the config option deploy.power_off_after_deploy_failure, which will prevent powering off nodes once a deployment fails. Powering off nodes means loss of the log output from the agent. Choosing to leave the machine powered on would give an operator the opportunity to troubleshoot the issue using agent logs on the system in case of failure, or to prevent needing to power the machine back on for provisioning after cleaning. As a note, Nova will still attempt to delete the node on a failed deployment, so the option will not be very effective in non-standalone deployments until a similar patch is put into nova. Related-bug: 1531217 Change-Id: I7c25946de50ffda2869307cb4016e0ccfe81d192 Co-Authored-By: Josh Gachnang --- etc/ironic/ironic.conf.sample | 4 +++ ironic/drivers/modules/deploy_utils.py | 22 +++++++++------- .../unit/drivers/modules/test_deploy_utils.py | 26 ++++++++++++++++--- ...-poweroff-on-failure-86e43b3e39043990.yaml | 12 +++++++++ 4 files changed, 52 insertions(+), 12 deletions(-) create mode 100644 releasenotes/notes/add-support-for-no-poweroff-on-failure-86e43b3e39043990.yaml diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample index cc651034df..7a53e1af54 100644 --- a/etc/ironic/ironic.conf.sample +++ b/etc/ironic/ironic.conf.sample @@ -919,6 +919,10 @@ # Deprecated group/name - [agent]/agent_erase_devices_iterations #erase_devices_iterations=1 +# Whether to power off a node after deploy failure. Defaults +# to True. (boolean value) +#power_off_after_deploy_failure=true + [dhcp] diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index 7b615a9b84..95e59cbc5f 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -71,6 +71,10 @@ deploy_opts = [ deprecated_group='agent', default=1, help=_('Number of iterations to be run for erasing devices.')), + cfg.BoolOpt('power_off_after_deploy_failure', + default=True, + help=_('Whether to power off a node after deploy failure. ' + 'Defaults to True.')), ] CONF = cfg.CONF CONF.register_opts(deploy_opts, group='deploy') @@ -492,15 +496,15 @@ def set_failed_state(task, msg): % {'node': node.uuid, 'state': node.provision_state}) LOG.exception(msg2) - try: - manager_utils.node_power_action(task, states.POWER_OFF) - except Exception: - msg2 = (_LE('Node %s failed to power off while handling deploy ' - 'failure. This may be a serious condition. Node ' - 'should be removed from Ironic or put in maintenance ' - 'mode until the problem is resolved.') % node.uuid) - LOG.exception(msg2) - + if CONF.deploy.power_off_after_deploy_failure: + try: + manager_utils.node_power_action(task, states.POWER_OFF) + except Exception: + msg2 = (_LE('Node %s failed to power off while handling deploy ' + 'failure. This may be a serious condition. Node ' + 'should be removed from Ironic or put in maintenance ' + 'mode until the problem is resolved.') % node.uuid) + LOG.exception(msg2) # NOTE(deva): node_power_action() erases node.last_error # so we need to set it here. node.last_error = msg diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py index 8a916b11ac..8362c07bbf 100644 --- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py +++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py @@ -1252,7 +1252,7 @@ class OtherFunctionTestCase(db_base.DbTestCase): autospec=True) def _test_set_failed_state(self, mock_event, mock_power, mock_log, event_value=None, power_value=None, - log_calls=None): + log_calls=None, poweroff=True): err_msg = 'some failure' mock_event.side_effect = event_value mock_power.side_effect = power_value @@ -1260,9 +1260,12 @@ class OtherFunctionTestCase(db_base.DbTestCase): shared=False) as task: utils.set_failed_state(task, err_msg) mock_event.assert_called_once_with(task, 'fail') - mock_power.assert_called_once_with(task, states.POWER_OFF) + if poweroff: + mock_power.assert_called_once_with(task, states.POWER_OFF) + else: + self.assertFalse(mock_power.called) self.assertEqual(err_msg, task.node.last_error) - if log_calls: + if (log_calls and poweroff): mock_log.exception.assert_has_calls(log_calls) else: self.assertFalse(mock_log.called) @@ -1283,6 +1286,23 @@ class OtherFunctionTestCase(db_base.DbTestCase): power_value=iter([exc_param] * len(calls)), log_calls=calls) + def test_set_failed_state_no_poweroff(self): + cfg.CONF.deploy.power_off_after_deploy_failure = False + exc_state = exception.InvalidState('invalid state') + exc_param = exception.InvalidParameterValue('invalid parameter') + mock_call = mock.call(mock.ANY) + self._test_set_failed_state(poweroff=False) + calls = [mock_call] + self._test_set_failed_state(event_value=iter([exc_state] * len(calls)), + log_calls=calls, poweroff=False) + calls = [mock_call] + self._test_set_failed_state(power_value=iter([exc_param] * len(calls)), + log_calls=calls, poweroff=False) + calls = [mock_call, mock_call] + self._test_set_failed_state(event_value=iter([exc_state] * len(calls)), + power_value=iter([exc_param] * len(calls)), + log_calls=calls, poweroff=False) + def test_get_boot_option(self): self.node.instance_info = {'capabilities': '{"boot_option": "local"}'} result = utils.get_boot_option(self.node) diff --git a/releasenotes/notes/add-support-for-no-poweroff-on-failure-86e43b3e39043990.yaml b/releasenotes/notes/add-support-for-no-poweroff-on-failure-86e43b3e39043990.yaml new file mode 100644 index 0000000000..4808ad400c --- /dev/null +++ b/releasenotes/notes/add-support-for-no-poweroff-on-failure-86e43b3e39043990.yaml @@ -0,0 +1,12 @@ +--- +features: + - Operators can now set + deploy.power_off_after_deploy_failure to leave nodes + powered on when a deployment fails. This is useful + for troubleshooting deployment issues. As a note, + Nova will still attempt to delete a node after a failed + deployment, so deploy.power_off_after_deploy_failure + may not be very effective in non-standalone + deployments until a similar patch to ironic's driver in + nova is proposed. +