Implement execute clean steps
This implements executing the clean steps in the conductor. The RPC API
version is bumped to allow async clean steps to call back to the conductor.

Adds node.clean_step to store the current cleaning operation. The conductor
will get a list of clean steps from the node's drivers, order them by
priority, and then have the drivers execute the steps in order.

Adds a config option to enable cleaning, defaulting to True.

Related-bug: #1174153
Implements blueprint implement-cleaning-states
Change-Id: I96af133c501f86a6e620c4684ee65abad2111f7b
parent e807e28986
commit 7589d1faaf
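As an orientation note (illustrative only, not part of the change itself): the clean steps handled by this patch are plain dictionaries advertised by the driver interfaces and ordered by the conductor. A minimal sketch of their shape, using the field names introduced below:

    # Hypothetical illustration: one clean step as a driver advertises it and
    # as it is stored in node.clean_step / driver_internal_info['clean_steps'].
    step = {
        'step': 'erase_disks',   # name of the method on the driver interface
        'priority': 20,          # higher runs first; 0 disables it for cleaning
        'interface': 'deploy',   # which driver interface implements the step
    }
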
@@ -625,6 +625,15 @@

# unlimited. (integer value)
#inspect_timeout=1800

# Whether Ironic will attempt to "clean" a node when preparing
# a node for deployments or after an instance is deleted from
# a node. Cleaning is a configurable set of steps, such as
# erasing disk drives, that are performed on the node to
# ensure it is in a baseline state and ready to be deployed
# to. See the individual driver's documentation for supported
# cleaning steps. (boolean value)
#clean_nodes=true


[console]

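For operators who want to skip cleaning entirely, the new option can be turned off in ironic.conf (it defaults to true); a minimal example:

    [conductor]
    # Skip all clean steps; nodes move straight to AVAILABLE.
    clean_nodes=false
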
@@ -651,7 +651,7 @@ class Node(base.APIBase):
                     provision_updated_at=time, instance_info={},
                     maintenance=False, maintenance_reason=None,
                     inspection_finished_at=None, inspection_started_at=time,
                     console_enabled=False)
                     console_enabled=False, clean_step='')
        # NOTE(matty_dubs): The chassis_uuid getter() is based on the
        # _chassis_uuid variable:
        sample._chassis_uuid = 'edcad704-b2da-41d5-96d9-afd580ecfa12'

@@ -168,17 +168,36 @@ conductor_opts = [
                   default=1800,
                   help='Timeout (seconds) for waiting for node inspection. '
                        '0 - unlimited.'),
    cfg.BoolOpt('clean_nodes',
                default=True,
                help='Whether Ironic will attempt to "clean" a node '
                     'when preparing a node for deployments or after '
                     'an instance is deleted from a node. Cleaning is '
                     'a configurable set of steps, such as erasing disk '
                     'drives, that are performed on the node to ensure it '
                     'is in a baseline state and ready to be deployed to. '
                     'See the individual driver\'s documentation for '
                     'supported cleaning steps.'),
]
CONF = cfg.CONF
CONF.register_opts(conductor_opts, 'conductor')

CLEANING_INTERFACE_PRIORITY = {
    # When two clean steps have the same priority, their order is determined
    # by which interface is implementing the clean step. The clean step of the
    # interface with the highest value here, will be executed first in that
    # case.
    'power': 3,
    'management': 2,
    'deploy': 1
}


class ConductorManager(periodic_task.PeriodicTasks):
    """Ironic Conductor manager main class."""

    # NOTE(rloo): This must be in sync with rpcapi.ConductorAPI's.
    RPC_API_VERSION = '1.25'
    RPC_API_VERSION = '1.26'

    target = messaging.Target(version=RPC_API_VERSION)

@@ -798,25 +817,168 @@ class ConductorManager(periodic_task.PeriodicTasks):
                                       state=node.provision_state)
        self._do_node_clean(task)

    @messaging.expected_exceptions(exception.NoFreeConductorWorker,
                                   exception.NodeLocked,
                                   exception.InvalidStateRequested,
                                   exception.NodeNotFound)
    def continue_node_clean(self, context, node_id):
        """RPC method to continue cleaning a node.

        This is useful for cleaning tasks that are async. When they complete,
        they call back via RPC, a new worker and lock are set up, and cleaning
        continues. This can also be used to resume cleaning on take_over.

        :param context: an admin context.
        :param node_id: the id or uuid of a node.
        :raises: InvalidStateRequested if the node is not in CLEANING state
        :raises: NoFreeConductorWorker when there is no free worker to start
                 async task
        :raises: NodeLocked if node is locked by another conductor.
        :raises: NodeNotFound if the node no longer appears in the database

        """
        LOG.debug("RPC continue_node_clean called for node %s.", node_id)

        with task_manager.acquire(context, node_id, shared=False) as task:
            if task.node.provision_state != states.CLEANING:
                raise exception.InvalidStateRequested(_(
                    'Cannot continue cleaning on %(node)s, node is in '
                    '%(state)s state, should be %(clean_state)s') %
                    {'node': task.node.uuid,
                     'state': task.node.provision_state,
                     'clean_state': states.CLEANING})
            self._spawn_worker(
                self._do_next_clean_step,
                task,
                task.node.driver_internal_info.get('clean_steps', []),
                task.node.clean_step)

    def _do_node_clean(self, task):
        """Internal RPC method to perform automated cleaning of a node."""
        node = task.node
        LOG.debug('Starting cleaning for node %s' % node.uuid)
        LOG.debug('Starting cleaning for node %s', node.uuid)

        if not CONF.conductor.clean_nodes:
            # Skip cleaning, move to AVAILABLE.
            node.clean_step = None
            node.save()

            task.process_event('done')
            LOG.info(_LI('Cleaning is disabled, node %s has been successfully '
                         'moved to AVAILABLE state.'), node.uuid)
            return

        try:
            # NOTE(ghe): Valid power driver values are needed to perform
            # a cleaning.
            task.driver.power.validate(task)
        except (exception.InvalidParameterValue,
                exception.MissingParameterValue) as e:
            node.last_error = (_('Failed to validate power driver interface. '
                                 'Can not clean instance. Error: %(msg)s') %
                               {'msg': e})
            task.process_event('fail')
            msg = (_('Failed to validate power driver interface. '
                     'Can not clean node %(node)s. Error: %(msg)s') %
                   {'node': node.uuid, 'msg': e})
            return cleaning_error_handler(task, msg)

        # Allow the deploy driver to set up the ramdisk again (necessary for
        # IPA cleaning/zapping)
        try:
            prepare_result = task.driver.deploy.prepare_cleaning(task)
        except Exception as e:
            msg = (_('Failed to prepare node %(node)s for cleaning: %(e)s')
                   % {'node': node.uuid, 'e': e})
            LOG.exception(msg)
            return cleaning_error_handler(task, msg)
        if prepare_result == states.CLEANING:
            # Prepare is asynchronous, the deploy driver will need to
            # set node.driver_internal_info['clean_steps'] and
            # node.clean_step and then make an RPC call to
            # continue_node_cleaning to start cleaning.
            return

        # TODO(JoshNang) Implement
        # Move to AVAILABLE
        LOG.debug('Cleaning complete for node %s', node.uuid)
        set_node_cleaning_steps(task)

        self._do_next_clean_step(task,
                                 node.driver_internal_info['clean_steps'],
                                 node.clean_step)

    def _do_next_clean_step(self, task, steps, last_step):
        """Start executing cleaning/zapping steps from the last step (if any).

        :param task: a TaskManager instance with an exclusive lock
        :param steps: The complete list of steps that need to be executed
                      on the node
        :param last_step: The last step that was executed. {} will start
                          from the beginning
        """
        node = task.node

        # Trim already executed steps
        if last_step:
            try:
                # Trim off last_step (now finished) and all previous steps.
                steps = steps[steps.index(last_step) + 1:]
            except ValueError:
                msg = (_('Node %(node)s got an invalid last step for '
                         '%(state)s: %(step)s.') %
                       {'node': node.uuid, 'step': last_step,
                        'state': node.provision_state})
                LOG.exception(msg)
                return cleaning_error_handler(task, msg)

        LOG.debug('Executing %(state)s on node %(node)s, remaining steps: '
                  '%(steps)s', {'node': node.uuid, 'steps': steps,
                                'state': node.provision_state})
        # Execute each step until we hit an async step or run out of steps
        for step in steps:
            # Save which step we're about to start so we can restart
            # if necessary
            node.clean_step = step
            node.save()
            interface = getattr(task.driver, step.get('interface'))
            LOG.debug('Executing %(step)s on node %(node)s',
                      {'step': step, 'node': node.uuid})
            try:
                result = interface.execute_clean_step(task, step)
            except Exception as e:
                msg = (_('Node %(node)s failed step %(step)s: '
                         '%(exc)s') %
                       {'node': node.uuid, 'exc': e, 'step': node.clean_step})
                LOG.exception(msg)
                cleaning_error_handler(task, msg)
                return

            # Check if the step is done or not. The step should return
            # states.CLEANING if the step is still being executed, or
            # None if the step is done.
            if result == states.CLEANING:
                # Kill this worker, the async step will make an RPC call to
                # continue_node_clean to continue cleaning
                LOG.debug('Waiting for node %(node)s to call continue after '
                          'async clean step %(step)s' %
                          {'node': node.uuid, 'step': step})
                return
            elif result is not None:
                msg = (_('While executing step %(step)s on node '
                         '%(node)s, step returned invalid value: %(val)s') %
                       {'step': step, 'node': node.uuid, 'val': result})
                LOG.error(msg)
                cleaning_error_handler(task, msg)
            LOG.info(_LI('Node %(node)s finished clean step %(step)s'),
                     {'node': node.uuid, 'step': step})

        # Clear clean_step
        node.clean_step = None
        driver_info = node.driver_internal_info
        driver_info['clean_steps'] = None
        node.driver_internal_info = driver_info
        try:
            task.driver.deploy.tear_down_cleaning(task)
        except Exception as e:
            msg = (_('Failed to tear down from cleaning for node %s')
                   % node.uuid)
            LOG.exception(msg)
            return cleaning_error_handler(task, msg)
        LOG.info(_LI('Node %s cleaning complete'), node.uuid)
        task.process_event('done')

    @messaging.expected_exceptions(exception.NoFreeConductorWorker,

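A note on the synchronous/asynchronous contract implemented above: a step that finishes in-line returns None and _do_next_clean_step moves on, while a step that returns states.CLEANING releases the conductor worker and is expected to call back later via the continue_node_clean RPC. A hypothetical driver-side sketch (illustrative only, not part of this patch; _start_background_erase is an assumed helper):

    # Hypothetical async clean step on a driver interface.
    def erase_disks(self, task):
        self._start_background_erase(task.node)  # hypothetical long-running work
        # Returning CLEANING frees this worker; when the work completes, the
        # driver calls ConductorAPI().continue_node_clean(context, node.uuid)
        # and cleaning resumes from node.clean_step.
        return states.CLEANING
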
@@ -1886,3 +2048,57 @@ def _do_inspect_hardware(task):
                         "state %(state)s") % {'state': new_state})
        handle_failure(error)
        raise exception.HardwareInspectionFailure(error=error)


def cleaning_error_handler(task, msg):
    """Put a failed node in CLEANFAIL or ZAPFAIL and maintenance."""
    # Reset clean step, msg should include current step
    if task.node.provision_state == states.CLEANING:
        task.node.clean_step = {}
    task.node.last_error = msg
    task.node.maintenance = True
    task.node.maintenance_reason = msg
    task.node.save()
    task.process_event('fail')


def _step_key(step):
    """Sort by priority, then interface priority in event of tie.

    :param step: cleaning step dict to get priority for.
    """
    return (step.get('priority'),
            CLEANING_INTERFACE_PRIORITY[step.get('interface')])


def _get_cleaning_steps(task, enabled=False):
    """Get sorted cleaning steps for task.node

    :param task: A TaskManager object
    :param enabled: If True, returns only enabled (priority > 0) steps. If
        False, returns all clean steps.
    :returns: A list of clean steps dictionaries, sorted with largest priority
        as the first item
    """

    steps = list()
    # Iterate interfaces and get clean steps from each
    for interface in CLEANING_INTERFACE_PRIORITY.keys():
        driver_interface = getattr(task.driver, interface)
        if driver_interface:
            for step in driver_interface.get_clean_steps(task):
                if not enabled or step['priority'] > 0:
                    steps.append(step)
    # Sort the steps from higher priority to lower priority
    return sorted(steps, key=_step_key, reverse=True)


def set_node_cleaning_steps(task):
    """Get the list of clean steps, save them to the node."""
    # Get the prioritized steps, store them.
    node = task.node
    driver_info = node.driver_internal_info
    driver_info['clean_steps'] = _get_cleaning_steps(task, enabled=True)
    node.driver_internal_info = driver_info
    node.clean_step = {}
    node.save()

@@ -68,11 +68,12 @@ class ConductorAPI(object):
    | 1.23 - Added do_provisioning_action
    | 1.24 - Added inspect_hardware method
    | 1.25 - Added destroy_port
    | 1.26 - Added continue_node_clean

    """

    # NOTE(rloo): This must be in sync with manager.ConductorManager's.
    RPC_API_VERSION = '1.25'
    RPC_API_VERSION = '1.26'

    def __init__(self, topic=None):
        super(ConductorAPI, self).__init__()

@@ -325,6 +326,23 @@ class ConductorAPI(object):
        return cctxt.call(context, 'do_provisioning_action',
                          node_id=node_id, action=action)

    def continue_node_clean(self, context, node_id, topic=None):
        """Signal to conductor service to start the next cleaning action.

        :param context: request context.
        :param node_id: node id or uuid.
        :param topic: RPC topic. Defaults to self.topic.
        :raises: NoFreeConductorWorker when there is no free worker to start
                 async task.
        :raises: InvalidStateRequested if the requested action can not
                 be performed.
        :raises: NodeLocked if node is locked by another conductor.
        :raises: NodeNotFound if the node no longer appears in the database
        """
        cctxt = self.client.prepare(topic=topic or self.topic, version='1.26')
        return cctxt.call(context, 'continue_node_clean',
                          node_id=node_id)

    def validate_driver_interfaces(self, context, node_id, topic=None):
        """Validate the `core` and `standardized` interfaces for drivers.

@@ -0,0 +1,35 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Add node.clean_step

Revision ID: 4f399b21ae71
Revises: 1e1d5ace7dc6
Create Date: 2015-02-18 01:21:46.062311

"""

# revision identifiers, used by Alembic.
revision = '4f399b21ae71'
down_revision = '1e1d5ace7dc6'

from alembic import op
import sqlalchemy as sa


def upgrade():
    op.add_column('nodes', sa.Column('clean_step', sa.Text(),
                                     nullable=True))


def downgrade():
    op.drop_column('nodes', 'clean_step')

@@ -167,6 +167,7 @@ class Node(Base):
    driver = Column(String(15))
    driver_info = Column(JSONEncodedDict)
    driver_internal_info = Column(JSONEncodedDict)
    clean_step = Column(JSONEncodedDict)

    # NOTE(deva): this is the host name of the conductor which has
    # acquired a TaskManager lock on the node.

@@ -128,7 +128,8 @@ class BaseInterface(object):
    interface_type = 'base'

    def __new__(cls, *args, **kwargs):
        # Cache the clean step iteration. We use __new__ instead of __init___
        # Get the list of clean steps when the interface is initialized by
        # the conductor. We use __new__ instead of __init___
        # to avoid breaking backwards compatibility with all the drivers.
        # We want to return all steps, regardless of priority.
        instance = super(BaseInterface, cls).__new__(cls, *args, **kwargs)

|
||||
'priority': method._clean_step_priority,
|
||||
'interface': instance.interface_type}
|
||||
instance.clean_steps.append(step)
|
||||
LOG.debug('Found clean steps %(steps)s for interface %(interface)s' %
|
||||
LOG.debug('Found clean steps %(steps)s for interface %(interface)s',
|
||||
{'steps': instance.clean_steps,
|
||||
'interface': instance.interface_type})
|
||||
return instance
|
||||
|
||||
def get_clean_steps(self):
|
||||
"""Get a list of enabled and disabled CleanSteps for the interface."""
|
||||
def get_clean_steps(self, task):
|
||||
"""Get a list of (enabled and disabled) clean steps for the interface.
|
||||
|
||||
This function will return all clean steps (both enabled and disabled)
|
||||
for the interface, in an unordered list.
|
||||
|
||||
:param task: A TaskManager object, useful for interfaces overriding
|
||||
this function
|
||||
:returns: A list of clean step dictionaries
|
||||
"""
|
||||
return self.clean_steps
|
||||
|
||||
def execute_clean_step(self, task, step):
|
||||
"""Execute the clean step on task.node.
|
||||
|
||||
Clean steps should take a single argument: a TaskManager object.
|
||||
Steps can be executed synchronously or asynchronously. Steps should
|
||||
A clean step should take a single argument: a TaskManager object.
|
||||
A step can be executed synchronously or asynchronously. A step should
|
||||
return None if the method has completed synchronously or
|
||||
states.CLEANING if the step will continue to execute asynchronously.
|
||||
If the step executes asynchronously, it should issue a call to the
|
||||
@@ -161,12 +170,12 @@ class BaseInterface(object):
        clean step.

        :param task: A TaskManager object
        :param step: A CleanStep object to execute
        :param step: The clean step dictionary representing the step to execute
        :returns: None if this method has completed synchronously, or
            states.CLEANING if the step will continue to execute
            asynchronously.
        """
        return getattr(self, step.get('step'))(task)
        return getattr(self, step['step'])(task)


@six.add_metaclass(abc.ABCMeta)

@@ -273,6 +282,42 @@ class DeployInterface(BaseInterface):
        :param task: a TaskManager instance containing the node to act on.
        """

    def prepare_cleaning(self, task):
        """Prepare the node for cleaning or zapping tasks.

        For example, nodes that use the Ironic Python Agent will need to
        boot the ramdisk in order to do in-band cleaning and zapping tasks.

        If the function is asynchronous, the driver will need to handle
        settings node.driver_internal_info['clean_steps'] and node.clean_step,
        as they would be set in ironic.conductor.manager._do_node_clean,
        but cannot be set when this is asynchronous. After, the interface
        should make an RPC call to continue_node_cleaning to start cleaning.

        NOTE(JoshNang) this should be moved to BootInterface when it gets
        implemented.

        :param task: a TaskManager instance containing the node to act on.
        :returns: If this function is going to be asynchronous, should return
            `states.CLEANING`. Otherwise, should return `None`. The interface
            will need to call _get_cleaning_steps and then RPC to
            continue_node_cleaning
        """
        pass

    def tear_down_cleaning(self, task):
        """Tear down after cleaning or zapping is completed.

        Given that cleaning or zapping is complete, do all cleanup and tear
        down necessary to allow the node to be deployed to again.

        NOTE(JoshNang) this should be moved to BootInterface when it gets
        implemented.

        :param task: a TaskManager instance containing the node to act on.
        """
        pass


@six.add_metaclass(abc.ABCMeta)
class PowerInterface(BaseInterface):

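For driver authors, a minimal sketch of how an interface plugs into the hooks above (illustrative only; ExampleDeploy is hypothetical and the remaining abstract DeployInterface methods are omitted):

    class ExampleDeploy(base.DeployInterface):
        """Hypothetical deploy interface showing the new cleaning hooks."""

        def get_clean_steps(self, task):
            # Advertise one in-band step; priority 0 would disable it for
            # automated cleaning.
            return [{'step': 'erase_disks', 'priority': 20,
                     'interface': self.interface_type}]

        def erase_disks(self, task):
            # Called via the inherited execute_clean_step(), which dispatches
            # on step['step']. Returning None marks the step as finished
            # synchronously; returning states.CLEANING would defer completion
            # to a later continue_node_clean RPC callback.
            return None

        def prepare_cleaning(self, task):
            # An agent-based driver would boot its cleaning ramdisk here and
            # could return states.CLEANING while it waits.
            pass

        def tear_down_cleaning(self, task):
            pass
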
@@ -35,7 +35,8 @@ class Node(base.IronicObject):
    # Version 1.8: Add maintenance_reason
    # Version 1.9: Add driver_internal_info
    # Version 1.10: Add name and get_by_name()
    VERSION = '1.10'
    # Version 1.11: Add clean_step
    VERSION = '1.11'

    dbapi = db_api.get_instance()

@@ -51,6 +52,11 @@ class Node(base.IronicObject):
        'driver_info': obj_utils.dict_or_none,
        'driver_internal_info': obj_utils.dict_or_none,

        # A clean step dictionary, indicating the current clean step
        # being executed, or None, indicating cleaning is not in progress
        # or has not yet started.
        'clean_step': obj_utils.dict_or_none,

        'instance_info': obj_utils.dict_or_none,
        'properties': obj_utils.dict_or_none,
        'reservation': obj_utils.str_or_none,

@@ -1531,9 +1531,143 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
        self.assertIsNone(node.last_error)
        mock_spawn.assert_called_with(self.service._do_node_clean, mock.ANY)


@_mock_record_keepalive
class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase):
    def setUp(self):
        super(DoNodeCleanTestCase, self).setUp()
        self.config(clean_nodes=True, group='conductor')
        self.power_update = {
            'step': 'update_firmware', 'priority': 10, 'interface': 'power'}
        self.deploy_update = {
            'step': 'update_firmware', 'priority': 10, 'interface': 'deploy'}
        self.deploy_erase = {
            'step': 'erase_disks', 'priority': 20, 'interface': 'deploy'}
        # Cleaning should be executed in this order
        self.clean_steps = [self.deploy_erase, self.power_update,
                            self.deploy_update]
        # Zap step
        self.deploy_raid = {
            'step': 'build_raid', 'priority': 0, 'interface': 'deploy'}

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.get_clean_steps')
    @mock.patch('ironic.drivers.modules.fake.FakePower.get_clean_steps')
    def test__get_cleaning_steps(self, mock_power_steps, mock_deploy_steps):
        # Test getting cleaning steps, with one driver returning None, two
        # conflicting priorities, and asserting they are ordered properly.
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE)

        mock_power_steps.return_value = [self.power_update]
        mock_deploy_steps.return_value = [self.deploy_erase,
                                          self.deploy_update]

        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            steps = manager._get_cleaning_steps(task, enabled=False)

        self.assertEqual(self.clean_steps, steps)

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.get_clean_steps')
    @mock.patch('ironic.drivers.modules.fake.FakePower.get_clean_steps')
    def test__get_cleaning_steps_only_enabled(self, mock_power_steps,
                                              mock_deploy_steps):
        # Test getting only cleaning steps, with one driver returning None, two
        # conflicting priorities, and asserting they are ordered properly.
        # Should discard zap step
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE)

        mock_power_steps.return_value = [self.power_update]
        mock_deploy_steps.return_value = [self.deploy_erase,
                                          self.deploy_update,
                                          self.deploy_raid]

        with task_manager.acquire(
                self.context, node['id'], shared=True) as task:
            steps = manager._get_cleaning_steps(task, enabled=True)

        self.assertEqual(self.clean_steps, steps)

    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker')
    def test_continue_node_clean_worker_pool_full(self, mock_spawn):
        # Test the appropriate exception is raised if the worker pool is full
        prv_state = states.CLEANING
        tgt_prv_state = states.AVAILABLE
        node = obj_utils.create_test_node(self.context, driver='fake',
                                          provision_state=prv_state,
                                          target_provision_state=tgt_prv_state,
                                          last_error=None)
        self._start_service()

        mock_spawn.side_effect = exception.NoFreeConductorWorker()

        exc = self.assertRaises(messaging.rpc.ExpectedException,
                                self.service.continue_node_clean,
                                self.context, node.uuid)
        # Compare true exception hidden by @messaging.expected_exceptions
        self.assertEqual(exception.NoFreeConductorWorker, exc.exc_info[0])

        self.service._worker_pool.waitall()
        node.refresh()
        # Make sure things were rolled back
        self.assertEqual(prv_state, node.provision_state)
        self.assertEqual(tgt_prv_state, node.target_provision_state)

    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker')
    def test_continue_node_clean_wrong_state(self, mock_spawn):
        # Test the appropriate exception is raised if node isn't already
        # in CLEANING state
        prv_state = states.DELETING
        tgt_prv_state = states.AVAILABLE
        node = obj_utils.create_test_node(self.context, driver='fake',
                                          provision_state=prv_state,
                                          target_provision_state=tgt_prv_state,
                                          last_error=None)
        self._start_service()

        exc = self.assertRaises(messaging.rpc.ExpectedException,
                                self.service.continue_node_clean,
                                self.context, node.uuid)
        # Compare true exception hidden by @messaging.expected_exceptions
        self.assertEqual(exception.InvalidStateRequested, exc.exc_info[0])

        self.service._worker_pool.waitall()
        node.refresh()
        # Make sure things were rolled back
        self.assertEqual(prv_state, node.provision_state)
        self.assertEqual(tgt_prv_state, node.target_provision_state)
        # Verify reservation has been cleared.
        self.assertIsNone(node.reservation)

    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker')
    def test_continue_node_clean(self, mock_spawn):
        # test a node can continue cleaning via RPC
        prv_state = states.CLEANING
        tgt_prv_state = states.AVAILABLE
        driver_info = {'clean_steps': self.clean_steps}
        node = obj_utils.create_test_node(self.context, driver='fake',
                                          provision_state=prv_state,
                                          target_provision_state=tgt_prv_state,
                                          last_error=None,
                                          driver_internal_info=driver_info,
                                          clean_step=self.clean_steps[1])
        self._start_service()
        self.service.continue_node_clean(self.context, node.uuid)
        self.service._worker_pool.waitall()
        node.refresh()
        mock_spawn.assert_called_with(self.service._do_next_clean_step,
                                      mock.ANY, self.clean_steps,
                                      self.clean_steps[1])

    @mock.patch('ironic.drivers.modules.fake.FakePower.validate')
    def test__do_node_clean_validate_fail(self, mock_validate):
        # InvalidParameterValue should be cause node to go to CLEANFAIL
        self.config(clean_nodes=True, group='conductor')
        mock_validate.side_effect = exception.InvalidParameterValue('error')
        node = obj_utils.create_test_node(
            self.context, driver='fake',

@@ -1545,6 +1679,271 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
        node.refresh()
        self.assertEqual(states.CLEANFAIL, node.provision_state)

    @mock.patch('ironic.drivers.modules.fake.FakePower.validate')
    def test__do_node_clean_disabled(self, mock_validate):
        self.config(clean_nodes=False, group='conductor')
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None)

        self._start_service()
        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_node_clean(task)
        self.service._worker_pool.waitall()
        node.refresh()

        # Assert that the node was moved to available without cleaning
        mock_validate.assert_not_called()
        self.assertEqual(states.AVAILABLE, node.provision_state)
        self.assertEqual(states.NOSTATE, node.target_provision_state)
        self.assertEqual({}, node.clean_step)
        self.assertIsNone(node.driver_internal_info.get('clean_steps'))

    @mock.patch('ironic.conductor.manager.set_node_cleaning_steps')
    @mock.patch('ironic.conductor.manager.ConductorManager.'
                '_do_next_clean_step')
    @mock.patch('ironic.drivers.modules.fake.FakePower.validate')
    def test__do_node_clean(self, mock_validate, mock_next_step, mock_steps):
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            power_state=states.POWER_OFF,
            driver_internal_info={'clean_steps': []})

        mock_steps.return_value = self.clean_steps

        self._start_service()
        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_node_clean(task)

        self.service._worker_pool.waitall()
        node.refresh()

        mock_validate.assert_called_once()
        mock_next_step.assert_called_once_with(mock.ANY, [], {})
        mock_steps.assert_called_once()

        # Check that state didn't change
        self.assertEqual(states.CLEANING, node.provision_state)
        self.assertEqual(states.AVAILABLE, node.target_provision_state)

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step')
    def test__do_next_clean_step_first_step_async(self, mock_execute):
        # Execute the first async clean step on a node
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step={})
        mock_execute.return_value = states.CLEANING

        self._start_service()

        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_next_clean_step(task, self.clean_steps,
                                             node.clean_step)

        self.service._worker_pool.waitall()
        node.refresh()

        self.assertEqual(states.CLEANING, node.provision_state)
        self.assertEqual(self.clean_steps[0], node.clean_step)
        mock_execute.assert_called_once_with(mock.ANY, self.clean_steps[0])

    @mock.patch('ironic.drivers.modules.fake.FakePower.execute_clean_step')
    def test__do_next_clean_step_continue_from_last_step(self, mock_execute):
        # Resume an in-progress cleaning after the first async step
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step=self.clean_steps[0])
        mock_execute.return_value = states.CLEANING

        self._start_service()

        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_next_clean_step(task, self.clean_steps,
                                             self.clean_steps[0])

        self.service._worker_pool.waitall()
        node.refresh()

        self.assertEqual(states.CLEANING, node.provision_state)
        self.assertEqual(self.clean_steps[1], node.clean_step)
        mock_execute.assert_called_once_with(mock.ANY, self.clean_steps[1])

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step')
    def test__do_next_clean_step_last_step_noop(self, mock_execute):
        # Resume where last_step is the last cleaning step, should be noop
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step=self.clean_steps[-1])

        self._start_service()

        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_next_clean_step(
                task, self.clean_steps, self.clean_steps[-1])

        self.service._worker_pool.waitall()
        node.refresh()

        # Cleaning should be complete without calling additional steps
        self.assertEqual(states.AVAILABLE, node.provision_state)
        self.assertEqual({}, node.clean_step)
        mock_execute.assert_not_called()

    @mock.patch('ironic.drivers.modules.fake.FakePower.execute_clean_step')
    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step')
    def test__do_next_clean_step_all(self, mock_deploy_execute,
                                     mock_power_execute):
        # Run all steps from start to finish (all synchronous)
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step={})
        mock_deploy_execute.return_value = None
        mock_power_execute.return_value = None

        self._start_service()

        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_next_clean_step(
                task, self.clean_steps, node.clean_step)

        self.service._worker_pool.waitall()
        node.refresh()

        # Cleaning should be complete
        self.assertEqual(states.AVAILABLE, node.provision_state)
        self.assertEqual({}, node.clean_step)
        mock_power_execute.assert_called_once_with(mock.ANY,
                                                   self.clean_steps[1])
        mock_deploy_execute.assert_has_calls = [
            mock.call(self.clean_steps[0]),
            mock.call(self.clean_steps[2])
        ]

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step')
    def test__do_next_clean_step_bad_last_step(self, mock_execute):
        # Make sure cleaning fails if last_step is incorrect
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step={})

        self._start_service()

        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_next_clean_step(
                task, self.clean_steps, {'interface': 'deploy',
                                         'step': 'not_a_clean_step',
                                         'priority': 100})

        self.service._worker_pool.waitall()
        node.refresh()

        # Node should have failed without executing anything
        self.assertEqual(states.CLEANFAIL, node.provision_state)
        self.assertEqual({}, node.clean_step)
        self.assertIsNotNone(node.last_error)
        self.assertTrue(node.maintenance)
        mock_execute.assert_not_called()

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step')
    def test__do_next_clean_step_fail(self, mock_execute):
        # When a clean step fails, go to CLEANFAIL
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step={})
        mock_execute.side_effect = Exception()

        self._start_service()

        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_next_clean_step(
                task, self.clean_steps, node.clean_step)

        self.service._worker_pool.waitall()
        node.refresh()

        # Make sure we go to CLEANFAIL, clear clean_steps
        self.assertEqual(states.CLEANFAIL, node.provision_state)
        self.assertEqual({}, node.clean_step)
        self.assertIsNotNone(node.last_error)
        self.assertTrue(node.maintenance)
        mock_execute.assert_not_called()
        mock_execute.assert_called_once_with(mock.ANY, self.clean_steps[0])

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step')
    def test__do_next_clean_step_no_steps(self, mock_execute):
        # Resume where there are no steps, should be a noop
        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step={})

        self._start_service()

        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            self.service._do_next_clean_step(
                task, [], node.clean_step)

        self.service._worker_pool.waitall()
        node.refresh()

        # Cleaning should be complete without calling additional steps
        self.assertEqual(states.AVAILABLE, node.provision_state)
        self.assertEqual({}, node.clean_step)
        mock_execute.assert_not_called()

    @mock.patch('ironic.conductor.manager._get_cleaning_steps')
    def test_set_node_cleaning_steps(self, mock_steps):
        mock_steps.return_value = self.clean_steps

        node = obj_utils.create_test_node(
            self.context, driver='fake',
            provision_state=states.CLEANING,
            target_provision_state=states.AVAILABLE,
            last_error=None,
            clean_step=None)

        with task_manager.acquire(
                self.context, node['id'], shared=False) as task:
            manager.set_node_cleaning_steps(task)
            node.refresh()
            self.assertEqual(self.clean_steps,
                             task.node.driver_internal_info['clean_steps'])
            self.assertEqual({}, node.clean_step)


@_mock_record_keepalive
class MiscTestCase(_ServiceSetUpMixin, _CommonMixIn, tests_db_base.DbTestCase):

@@ -288,3 +288,9 @@ class RPCAPITestCase(base.DbTestCase):
                          'call',
                          version='1.24',
                          node_id=self.fake_node['uuid'])

    def test_continue_node_clean(self):
        self._test_rpcapi('continue_node_clean',
                          'call',
                          version='1.26',
                          node_id=self.fake_node['uuid'])

@@ -367,6 +367,13 @@ class MigrationCheckersMixin(object):
        self.assertIsInstance(nodes.c.driver_internal_info.type,
                              sqlalchemy.types.TEXT)

    def _check_4f399b21ae71(self, engine, data):
        nodes = db_utils.get_table(engine, 'nodes')
        col_names = [column.name for column in nodes.c]
        self.assertIn('clean_step', col_names)
        self.assertIsInstance(nodes.c.clean_step.type,
                              sqlalchemy.types.String)

    def test_upgrade_and_version(self):
        with patch_with_engine(self.engine):
            self.migration_api.upgrade('head')

@@ -201,6 +201,7 @@ def get_test_node(**kw):
        'driver': kw.get('driver', 'fake'),
        'driver_info': kw.get('driver_info', fake_info),
        'driver_internal_info': kw.get('driver_internal_info', fake_info),
        'clean_step': kw.get('clean_step'),
        'properties': kw.get('properties', properties),
        'reservation': kw.get('reservation'),
        'maintenance': kw.get('maintenance', False),

@@ -152,25 +152,33 @@ class CleanStepTestCase(base.TestCase):
        obj = TestClass()
        obj2 = TestClass2()

        self.assertEqual(2, len(obj.get_clean_steps()))
        self.assertEqual(2, len(obj.get_clean_steps(task_mock)))
        # Ensure the steps look correct
        self.assertEqual(10, obj.get_clean_steps()[0]['priority'])
        self.assertEqual('test', obj.get_clean_steps()[0]['interface'])
        self.assertEqual('clean_method', obj.get_clean_steps()[0]['step'])
        self.assertEqual(0, obj.get_clean_steps()[1]['priority'])
        self.assertEqual('test', obj.get_clean_steps()[1]['interface'])
        self.assertEqual('zap_method', obj.get_clean_steps()[1]['step'])
        self.assertEqual(10, obj.get_clean_steps(task_mock)[0]['priority'])
        self.assertEqual('test', obj.get_clean_steps(
            task_mock)[0]['interface'])
        self.assertEqual('clean_method', obj.get_clean_steps(
            task_mock)[0]['step'])
        self.assertEqual(0, obj.get_clean_steps(task_mock)[1]['priority'])
        self.assertEqual('test', obj.get_clean_steps(
            task_mock)[1]['interface'])
        self.assertEqual('zap_method', obj.get_clean_steps(
            task_mock)[1]['step'])

        # Ensure the second obj get different clean steps
        self.assertEqual(2, len(obj2.get_clean_steps()))
        self.assertEqual(2, len(obj2.get_clean_steps(task_mock)))
        # Ensure the steps look correct
        self.assertEqual(20, obj2.get_clean_steps()[0]['priority'])
        self.assertEqual('test2', obj2.get_clean_steps()[0]['interface'])
        self.assertEqual('clean_method2', obj2.get_clean_steps()[0]['step'])
        self.assertEqual(0, obj2.get_clean_steps()[1]['priority'])
        self.assertEqual('test2', obj2.get_clean_steps()[1]['interface'])
        self.assertEqual('zap_method2', obj2.get_clean_steps()[1]['step'])
        self.assertEqual(20, obj2.get_clean_steps(task_mock)[0]['priority'])
        self.assertEqual('test2', obj2.get_clean_steps(
            task_mock)[0]['interface'])
        self.assertEqual('clean_method2', obj2.get_clean_steps(
            task_mock)[0]['step'])
        self.assertEqual(0, obj2.get_clean_steps(task_mock)[1]['priority'])
        self.assertEqual('test2', obj2.get_clean_steps(
            task_mock)[1]['interface'])
        self.assertEqual('zap_method2', obj2.get_clean_steps(
            task_mock)[1]['step'])

        # Ensure we can execute the function.
        obj.execute_clean_step(task_mock, obj.get_clean_steps()[0])
        obj.execute_clean_step(task_mock, obj.get_clean_steps(task_mock)[0])
        method_mock.assert_called_once_with(task_mock)