Do not tear down node upon cleaning failure
In case of a failure during cleaning, ironic currently shuts the node off. This is dangerous, e.g. when the cleaning step is a firmware upgrade. This patch proposes to corect this behaviour and leave the node on in case cleaning raises an exception. Task: #30357 Story: #2005375 Change-Id: I5fe8b380c890eb9b9dcee33868ceda2a9bab9929
This commit is contained in:
parent
46533ebea2
commit
f5a676a608
@ -385,6 +385,8 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
|
||||
set_fail_state=True):
|
||||
"""Put a failed node in CLEANFAIL and maintenance."""
|
||||
node = task.node
|
||||
node.fault = faults.CLEAN_FAILURE
|
||||
node.maintenance = True
|
||||
|
||||
if tear_down_cleaning:
|
||||
try:
|
||||
@ -410,9 +412,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
|
||||
# for automated cleaning, it is AVAILABLE.
|
||||
manual_clean = node.target_provision_state == states.MANAGEABLE
|
||||
node.last_error = msg
|
||||
node.maintenance = True
|
||||
node.maintenance_reason = msg
|
||||
node.fault = faults.CLEAN_FAILURE
|
||||
node.save()
|
||||
|
||||
if set_fail_state and node.provision_state != states.CLEANFAIL:
|
||||
|
@ -32,6 +32,7 @@ from oslo_utils import strutils
|
||||
import six
|
||||
|
||||
from ironic.common import exception
|
||||
from ironic.common import faults
|
||||
from ironic.common.glance_service import service_utils
|
||||
from ironic.common.i18n import _
|
||||
from ironic.common import image_service
|
||||
@ -935,10 +936,11 @@ def tear_down_inband_cleaning(task, manage_boot=True):
|
||||
"""Tears down the environment setup for in-band cleaning.
|
||||
|
||||
This method does the following:
|
||||
1. Powers off the bare metal node.
|
||||
2. If 'manage_boot' parameter is set to true, it also
|
||||
calls the 'clean_up_ramdisk' method of boot interface to clean up
|
||||
the environment that was set for booting agent ramdisk.
|
||||
1. Powers off the bare metal node (unless the node is fast
|
||||
tracked or there was a cleaning failure).
|
||||
2. If 'manage_boot' parameter is set to true, it also calls
|
||||
the 'clean_up_ramdisk' method of boot interface to clean
|
||||
up the environment that was set for booting agent ramdisk.
|
||||
3. Deletes the cleaning ports which were setup as part
|
||||
of cleaning.
|
||||
|
||||
@ -950,7 +952,11 @@ def tear_down_inband_cleaning(task, manage_boot=True):
|
||||
removed.
|
||||
"""
|
||||
fast_track = manager_utils.is_fast_track(task)
|
||||
if not fast_track:
|
||||
|
||||
node = task.node
|
||||
cleaning_failure = (node.fault == faults.CLEAN_FAILURE)
|
||||
|
||||
if not (fast_track or cleaning_failure):
|
||||
manager_utils.node_power_action(task, states.POWER_OFF)
|
||||
|
||||
if manage_boot:
|
||||
@ -958,7 +964,7 @@ def tear_down_inband_cleaning(task, manage_boot=True):
|
||||
|
||||
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
|
||||
task.driver.network.remove_cleaning_network(task)
|
||||
if not fast_track:
|
||||
if not (fast_track or cleaning_failure):
|
||||
manager_utils.restore_power_state_if_needed(
|
||||
task, power_state_to_restore)
|
||||
|
||||
|
@ -31,6 +31,7 @@ from testtools import matchers
|
||||
|
||||
from ironic.common import boot_devices
|
||||
from ironic.common import exception
|
||||
from ironic.common import faults
|
||||
from ironic.common import image_service
|
||||
from ironic.common import states
|
||||
from ironic.common import utils as common_utils
|
||||
@ -1750,12 +1751,14 @@ class AgentMethodsTestCase(db_base.DbTestCase):
|
||||
def _test_tear_down_inband_cleaning(
|
||||
self, power_mock, remove_cleaning_network_mock,
|
||||
clean_up_ramdisk_mock, is_fast_track_mock,
|
||||
manage_boot=True, fast_track=False):
|
||||
manage_boot=True, fast_track=False, cleaning_error=False):
|
||||
is_fast_track_mock.return_value = fast_track
|
||||
with task_manager.acquire(
|
||||
self.context, self.node.uuid, shared=False) as task:
|
||||
if cleaning_error:
|
||||
task.node.fault = faults.CLEAN_FAILURE
|
||||
utils.tear_down_inband_cleaning(task, manage_boot=manage_boot)
|
||||
if not fast_track:
|
||||
if not (fast_track or cleaning_error):
|
||||
power_mock.assert_called_once_with(task, states.POWER_OFF)
|
||||
else:
|
||||
self.assertFalse(power_mock.called)
|
||||
@ -1775,6 +1778,9 @@ class AgentMethodsTestCase(db_base.DbTestCase):
|
||||
def test_tear_down_inband_cleaning_fast_track(self):
|
||||
self._test_tear_down_inband_cleaning(fast_track=True)
|
||||
|
||||
def test_tear_down_inband_cleaning_cleaning_error(self):
|
||||
self._test_tear_down_inband_cleaning(cleaning_error=True)
|
||||
|
||||
def test_build_agent_options_conf(self):
|
||||
self.config(api_url='https://api-url', group='conductor')
|
||||
options = utils.build_agent_options(self.node)
|
||||
|
@ -0,0 +1,11 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes a bug where ironic would shut a node down upon cleaning failure.
|
||||
Now, the node stays powered on (as documented and intended).
|
||||
upgrade:
|
||||
- |
|
||||
When a failure occurs during cleaning, nodes will no longer be shut down. The
|
||||
behaviour was changed to prevent harm and allow for an admin intervention
|
||||
when sensitive operations, such as firmware upgrades, are performed and fail
|
||||
during cleaning.
|
Loading…
Reference in New Issue
Block a user