Add ability to power off nodes in clean failed

We have seen duplicate IP issues when leaving clean failed nodes
powered on. This patch allows operators to power down nodes that
enter the clean failed state.

Change-Id: Iecb402227485fe0ba787a262121c9d6a048b0e13
This commit is contained in:
Chris Krelle 2023-04-12 08:58:35 -07:00
parent 8ef9db1570
commit 510a612eed
4 changed files with 51 additions and 0 deletions

View File

@ -499,6 +499,11 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
# NOTE(dtantsur): avoid overwriting existing maintenance_reason # NOTE(dtantsur): avoid overwriting existing maintenance_reason
if not node.maintenance_reason and set_maintenance: if not node.maintenance_reason and set_maintenance:
node.maintenance_reason = errmsg node.maintenance_reason = errmsg
if CONF.conductor.poweroff_in_cleanfail:
# NOTE(NobodyCam): Power off node in clean fail
node_power_action(task, states.POWER_OFF)
node.save() node.save()
if set_fail_state and node.provision_state != states.CLEANFAIL: if set_fail_state and node.provision_state != states.CLEANFAIL:

View File

@ -349,6 +349,14 @@ opts = [
'is a global setting applying to all requests this ' 'is a global setting applying to all requests this '
'conductor receives, regardless of access rights. ' 'conductor receives, regardless of access rights. '
'The concurrent clean limit cannot be disabled.')), 'The concurrent clean limit cannot be disabled.')),
cfg.BoolOpt('poweroff_in_cleanfail',
default=False,
help=_('If True power off nodes in the ``clean failed`` '
'state. Default False. Option may be unsafe '
'when using Cleaning to perform '
'hardware-transformative actions such as '
'firmware upgrade.')),
] ]

View File

@ -436,6 +436,36 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertFalse(node.maintenance) self.assertFalse(node.maintenance)
self.assertIsNone(node.fault) self.assertIsNone(node.fault)
# Exercises the [conductor]poweroff_in_cleanfail option: when
# set_node_cleaning_steps raises inside do_node_clean, the node must end
# up in CLEANFAIL and the power interface's set_power_state must have
# been invoked.
# Stacked patch decorators inject their mocks bottom-up, which is why the
# parameters are ordered mock_steps, mock_validate, mock_power.
@mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state',
autospec=True)
@mock.patch.object(n_flat.FlatNetwork, 'validate', autospec=True)
@mock.patch.object(conductor_steps, 'set_node_cleaning_steps',
autospec=True)
def test_do_node_clean_steps_fail_poweroff(self, mock_steps, mock_validate,
mock_power, clean_steps=None,
invalid_exc=True):
# NOTE(review): clean_steps and invalid_exc look like leftovers from a
# shared helper signature; presumably the test runner calls this method
# with no extra arguments, so only the defaults (InvalidParameterValue,
# target state AVAILABLE) are ever exercised -- confirm.
if invalid_exc:
mock_steps.side_effect = exception.InvalidParameterValue('invalid')
else:
mock_steps.side_effect = exception.NodeCleaningFailure('failure')
# The target provision state is MANAGEABLE when clean_steps is given,
# AVAILABLE otherwise.
tgt_prov_state = states.MANAGEABLE if clean_steps else states.AVAILABLE
# Turn on the option under test for the duration of this test case.
self.config(poweroff_in_cleanfail=True, group='conductor')
# The node starts powered on and mid-cleaning.
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
uuid=uuidutils.generate_uuid(),
provision_state=states.CLEANING,
power_state=states.POWER_ON,
target_provision_state=tgt_prov_state)
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
cleaning.do_node_clean(task, clean_steps=clean_steps)
mock_validate.assert_called_once_with(mock.ANY, task)
# Reload the node from the database before checking its final state.
node.refresh()
self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False)
# Only checks that set_power_state was called at all; the requested
# power state and call count are not asserted here.
self.assertTrue(mock_power.called)
def test__do_node_clean_automated_steps_fail(self): def test__do_node_clean_automated_steps_fail(self):
for invalid in (True, False): for invalid in (True, False):
self.__do_node_clean_steps_fail(invalid_exc=invalid) self.__do_node_clean_steps_fail(invalid_exc=invalid)

View File

@ -0,0 +1,8 @@
---
features:
- |
Adds a new conductor configuration option,
``[conductor]poweroff_in_cleanfail`` (default: False). When True, nodes
entering the ``clean failed`` state will be powered off. This option may
be unsafe when using Cleaning to perform hardware-transformative actions
such as firmware upgrade.