Do not tear down node upon cleaning failure

In case of a failure during cleaning, ironic currently shuts the
node off. This is dangerous, e.g. when the cleaning step is a
firmware upgrade. This patch proposes to corect this behaviour
and leave the node on in case cleaning raises an exception.

Task: #30357
Story: #2005375
Change-Id: I5fe8b380c890eb9b9dcee33868ceda2a9bab9929
This commit is contained in:
Arne Wiebalck 2019-07-19 15:07:29 +02:00
parent 46533ebea2
commit f5a676a608
4 changed files with 33 additions and 10 deletions

View File

@ -385,6 +385,8 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
set_fail_state=True):
"""Put a failed node in CLEANFAIL and maintenance."""
node = task.node
node.fault = faults.CLEAN_FAILURE
node.maintenance = True
if tear_down_cleaning:
try:
@ -410,9 +412,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
# for automated cleaning, it is AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
node.last_error = msg
node.maintenance = True
node.maintenance_reason = msg
node.fault = faults.CLEAN_FAILURE
node.save()
if set_fail_state and node.provision_state != states.CLEANFAIL:

View File

@ -32,6 +32,7 @@ from oslo_utils import strutils
import six
from ironic.common import exception
from ironic.common import faults
from ironic.common.glance_service import service_utils
from ironic.common.i18n import _
from ironic.common import image_service
@ -935,10 +936,11 @@ def tear_down_inband_cleaning(task, manage_boot=True):
"""Tears down the environment setup for in-band cleaning.
This method does the following:
1. Powers off the bare metal node.
2. If 'manage_boot' parameter is set to true, it also
calls the 'clean_up_ramdisk' method of boot interface to clean up
the environment that was set for booting agent ramdisk.
1. Powers off the bare metal node (unless the node is fast
tracked or there was a cleaning failure).
2. If 'manage_boot' parameter is set to true, it also calls
the 'clean_up_ramdisk' method of boot interface to clean
up the environment that was set for booting agent ramdisk.
3. Deletes the cleaning ports which were setup as part
of cleaning.
@ -950,7 +952,11 @@ def tear_down_inband_cleaning(task, manage_boot=True):
removed.
"""
fast_track = manager_utils.is_fast_track(task)
if not fast_track:
node = task.node
cleaning_failure = (node.fault == faults.CLEAN_FAILURE)
if not (fast_track or cleaning_failure):
manager_utils.node_power_action(task, states.POWER_OFF)
if manage_boot:
@ -958,7 +964,7 @@ def tear_down_inband_cleaning(task, manage_boot=True):
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
task.driver.network.remove_cleaning_network(task)
if not fast_track:
if not (fast_track or cleaning_failure):
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)

View File

@ -31,6 +31,7 @@ from testtools import matchers
from ironic.common import boot_devices
from ironic.common import exception
from ironic.common import faults
from ironic.common import image_service
from ironic.common import states
from ironic.common import utils as common_utils
@ -1750,12 +1751,14 @@ class AgentMethodsTestCase(db_base.DbTestCase):
def _test_tear_down_inband_cleaning(
self, power_mock, remove_cleaning_network_mock,
clean_up_ramdisk_mock, is_fast_track_mock,
manage_boot=True, fast_track=False):
manage_boot=True, fast_track=False, cleaning_error=False):
is_fast_track_mock.return_value = fast_track
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:
if cleaning_error:
task.node.fault = faults.CLEAN_FAILURE
utils.tear_down_inband_cleaning(task, manage_boot=manage_boot)
if not fast_track:
if not (fast_track or cleaning_error):
power_mock.assert_called_once_with(task, states.POWER_OFF)
else:
self.assertFalse(power_mock.called)
@ -1775,6 +1778,9 @@ class AgentMethodsTestCase(db_base.DbTestCase):
def test_tear_down_inband_cleaning_fast_track(self):
self._test_tear_down_inband_cleaning(fast_track=True)
def test_tear_down_inband_cleaning_cleaning_error(self):
self._test_tear_down_inband_cleaning(cleaning_error=True)
def test_build_agent_options_conf(self):
self.config(api_url='https://api-url', group='conductor')
options = utils.build_agent_options(self.node)

View File

@ -0,0 +1,11 @@
---
fixes:
- |
Fixes a bug where ironic would shut a node down upon cleaning failure.
Now, the node stays powered on (as documented and intended).
upgrade:
- |
When a failure occurs during cleaning, nodes will no longer be shut down. The
behaviour was changed to prevent harm and allow for an admin intervention
when sensitive operations, such as firmware upgrades, are performed and fail
during cleaning.