From f53ccb647f2e11c0d24581d97168039af0b5da22 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Tue, 26 Sep 2023 09:27:17 -0700 Subject: [PATCH] Handle Power On/Off for child node cases So, there are cases where say you may have multiple DPUs in a physical server, each card when fully operating can consume 100-150 watts. In some cases, these cards can have external power supplies, but need the physical host in a running power-on state in order for the device to be powered on. Conversely, we also now power off the child nodes if the parent node has been requested to be powered off, since we *really* don't want cause inadvertent harm to the child node. This is realistically a fix we should backport once we sort through the details, if we agree this makes sense to do, as is. Change-Id: Ib2bfe04cdaa82264ba8bb1e71477899bb6268179 --- doc/source/admin/cleaning.rst | 17 ++ doc/source/admin/steps.rst | 28 +++ ironic/common/exception.py | 14 ++ ironic/conductor/utils.py | 84 ++++++- ironic/db/api.py | 18 ++ ironic/db/sqlalchemy/api.py | 21 ++ ironic/objects/node.py | 11 + ironic/tests/unit/conductor/test_cleaning.py | 29 ++- ironic/tests/unit/conductor/test_servicing.py | 30 ++- ironic/tests/unit/conductor/test_utils.py | 232 ++++++++++++++++++ ...-ops-for-child-nodes-67a11f1900ce137a.yaml | 11 + 11 files changed, 476 insertions(+), 19 deletions(-) create mode 100644 releasenotes/notes/handle-power-ops-for-child-nodes-67a11f1900ce137a.yaml diff --git a/doc/source/admin/cleaning.rst b/doc/source/admin/cleaning.rst index 04e84617d1..63aa294439 100644 --- a/doc/source/admin/cleaning.rst +++ b/doc/source/admin/cleaning.rst @@ -478,6 +478,23 @@ power will be turned off via the management interface. Afterwards, the the ``ironic-python-agent`` running on any child nodes. This constraint may be changed in the future. +Power Management with Child Nodes +--------------------------------- + +The mix of child nodes and parent nodes has special power considerations, +and these devices are evolving in the industry. That being said, the Ironic +project has taken an approach of explicitly attempting to "power on" any +parent node when a request comes in to "power on" a child node. This can be +bypassed by setting a ``driver_info`` parameter ``has_dedicated_power_supply`` +set to ``True``, in recognition that some hardware vendors are working on +supplying independent power to these classes of devices to meet their customer +use cases. + +Similarly to the case of a "power on" request for a child node, when power +is requested to be turned off for a "parent node", Ironic will issue +"power off" commands for all child nodes unless the child node has the +``has_dedicated_power_supply`` option set in the node's ``driver_info`` field. + Troubleshooting =============== If cleaning fails on a node, the node will be put into ``clean failed`` state. diff --git a/doc/source/admin/steps.rst b/doc/source/admin/steps.rst index 4bd270a141..86f5664273 100644 --- a/doc/source/admin/steps.rst +++ b/doc/source/admin/steps.rst @@ -99,3 +99,31 @@ a ``clean hold`` state. Once you have completed whatever action which needed to be performed while the node was in a held state, you will need to issue an unhold provision state command, via the API or command line to inform the node to proceed. + +Set the environment +=================== + +When using steps with the functionality to execute on child nodes, +i.e. nodes who a populated ``parent_node`` field, you always want to +ensure you have set the environment appropriately for your next action. + +For example, if you are executing steps against a parent node, which then +execute against a child node via the ``execute_on_child_nodes`` step option, +and it requires power to be on, you will want to explicitly +ensure the power is on for the parent node **unless** the child node can +operate independently, as signaled through the driver_info option +``has_dedicated_power_supply`` on the child node. Power is an obvious +case because Ironic has guarding logic internally to attempt to power-on the +parent node, but it cannot be an after thought due to internal task locking. + +Power specifically aside, the general principle applies to the execution +of all steps. You need always want to build upon the prior step or existing +existing known state of the system. + +.. NOTE:: + Ironic will attempt to ensure power is active for a ``parent_node`` when + powering on a child node. Conversely, Ironic will also attempt to power + down child nodes if a parent node is requested to be turned off, unless + the ``has_dedicated_power_supply`` option is set for the child node. + This pattern of behavior prevents parent nodes from being automatically + powered back on should a child node be left online. diff --git a/ironic/common/exception.py b/ironic/common/exception.py index 6ff4494ce8..9080000ae7 100644 --- a/ironic/common/exception.py +++ b/ironic/common/exception.py @@ -931,3 +931,17 @@ class ImageChecksumFileReadFailure(InvalidImage): _msg_fmt = _("Failed to read the file from local storage " "to perform a checksum operation.") code = http_client.SERVICE_UNAVAILABLE + + +class ParentNodeLocked(Conflict): + _msg_fmt = _("Node %(node)s parent_node %(parent)s is presently locked " + "and we are unable to perform any action on it at this " + "time. Please retry after the current operation is " + "completed.") + + +class ChildNodeLocked(Conflict): + _msg_fmt = _("Node %(node)s child_node %(child)s is presently locked " + "and we are unable to perform any action on it at this " + "time. Please retry after the current operation is " + "completed.") diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 65d3a424c7..2cbf99ca80 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -290,7 +290,12 @@ def node_power_action(task, new_state, timeout=None): storage interface upon setting power on. :raises: other exceptions by the node's power driver if something wrong occurred during the power action. - + :raises: ChildNodeLocked when a child node must be acted upon due + to this request, but we cannot act upon it due to a lock + being held. + :raises: ParentNodeLocked when a parent node must be acted upon + due to this request, but we cannot act upon it due to a + lock being held. """ notify_utils.emit_power_set_notification( task, fields.NotificationLevel.INFO, fields.NotificationStatus.START, @@ -313,6 +318,15 @@ def node_power_action(task, new_state, timeout=None): node.save() return + # Parent node power required? + if task.node.parent_node: + # If we have a parent node defined for the device, we likely + # need to turn the power on. + parent_power_needed = not task.node.driver_info.get( + 'has_dedicated_power_supply', False) + else: + parent_power_needed = False + # Set the target_power_state and clear any last_error, if we're # starting a new operation. This will expose to other processes # and clients that work is in progress. Keep the last_error intact @@ -336,6 +350,12 @@ def node_power_action(task, new_state, timeout=None): task.driver.storage.attach_volumes(task) if new_state != states.REBOOT: + if new_state == states.POWER_ON and parent_power_needed: + _handle_child_power_on(task, target_state, timeout) + if new_state == states.POWER_OFF: + _handle_child_power_off(task, target_state, timeout) + # Parent/Child dependencies handled, we can take care of + # the original node now. task.driver.power.set_power_state(task, new_state, timeout=timeout) else: # TODO(TheJulia): We likely ought to consider toggling @@ -384,6 +404,68 @@ def node_power_action(task, new_state, timeout=None): {'node': node.uuid, 'error': e}) +def _handle_child_power_on(task, target_state, timeout): + """Actions related to powering on a parent for a child node.""" + with task_manager.acquire( + task.context, task.node.parent_node, + shared=True, + purpose='power on parent node') as pn_task: + if (pn_task.driver.power.get_power_state(pn_task) + != states.POWER_ON): + try: + pn_task.upgrade_lock() + except exception.NodeLocked as e: + LOG.error('Cannot power on parent_node %(pn)s ' + 'as it is locked by %(host)s. We are ' + 'unable to proceed with the current ' + 'operation for child node %(node)s. ' + 'Error: %(error)s', + {'pn': pn_task.node.uuid, + 'host': pn_task.node.reservation, + 'node': task.node.uuid, + 'error': e}) + raise exception.ParentNodeLocked( + node=task.node.uuid, parent=pn_task.node.uuid) + node_power_action(pn_task, target_state, timeout) + + +def _handle_child_power_off(task, target_state, timeout): + """Actions related to powering off child nodes.""" + child_nodes = task.node.list_child_node_ids(exclude_dedicated_power=True) + cn_tasks = [] + try: + for child in child_nodes: + # Get the task for the node, check the status, add + # the list of locks to act upon. + cn_task = task_manager.acquire( + task.context, child, shared=True, + purpose='child node power off') + if (cn_task.driver.power.get_power_state(cn_task) + != states.POWER_OFF): + cn_task.upgrade_lock() + cn_tasks.append(cn_task) + for cn_task in cn_tasks: + node_power_action(cn_task, target_state, timeout) + except exception.NodeLocked as e: + LOG.error( + 'Cannot power off child node %(cn)s ' + 'as it is locked by %(host)s. We are ' + 'unable to proceed with the current ' + 'operation for parent node %(node)s. ' + 'Error: %(error)s', + {'cn': cn_task.node.uuid, + 'host': cn_task.node.reservation, + 'node': task.node.uuid, + 'error': e}) + raise exception.ChildNodeLocked( + node=task.node.uuid, child=cn_task.node.uuid) + finally: + for cn_task in cn_tasks: + # We don't need to do anything else with the child node, + # and we need to release the child task's lock. + cn_task.release_resources() + + @task_manager.require_exclusive_lock def cleanup_after_timeout(task): """Cleanup deploy task after timeout. diff --git a/ironic/db/api.py b/ironic/db/api.py index b6f38a95d3..571c41bc01 100644 --- a/ironic/db/api.py +++ b/ironic/db/api.py @@ -1616,3 +1616,21 @@ class Connection(object, metaclass=abc.ABCMeta): :returns: A list of FirmwareComponent objects. :raises: NodeNotFound if the node is not found. """ + + @abc.abstractclassmethod + def get_child_node_ids_by_parent_uuid(self, node_uuid, + exclude_dedicated_power=False): + """Retrieve a list of child node IDs for a given parent UUID. + + This is an "internal" method, intended for use with power + management logic to facilitate power actions upon nodes, + where we obtain a list of nodes which requires additional + actions, and return *only* the required node IDs in order + to launch new tasks for power management activities. + + :param node_uuid: The uuid of the parent node, in order to + directly match the "parent_node" field. + :param exclude_dedicated_power: Boolean, False, if the list + should include child nodes with their own power supplies. + :returns: A list of tuples. + """ diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index 34896be037..3c5c847b79 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -3148,3 +3148,24 @@ class Connection(api.Connection): .filter_by(node_id=node_id) .all()) return result + + def get_child_node_ids_by_parent_uuid( + self, node_uuid, exclude_dedicated_power=False): + with _session_for_read() as session: + nodes = [] + res = session.execute(sa.select( + models.Node.id, + models.Node.driver_info + ).where(models.Node.parent_node == node_uuid)).all() + for r in res: + # NOTE(TheJulia): Dtantsur has noted we don't typically + # evaluate content data inside of the db api, and that + # we typically have the caller do it. We should likely + # refactor this, at some point for consistency if + # we're okay with the likely additional overhead + # to post filter an an initial result set. + if (exclude_dedicated_power + and r[1].get('has_dedicated_power_supply')): + continue + nodes.append(r[0]) + return nodes diff --git a/ironic/objects/node.py b/ironic/objects/node.py index 3a26e97ab2..19f0105484 100644 --- a/ironic/objects/node.py +++ b/ironic/objects/node.py @@ -773,6 +773,17 @@ class Node(base.IronicObject, object_base.VersionedObjectDictCompat): self.properties[key] = value self._changed_fields.add('properties') + def list_child_node_ids(self, exclude_dedicated_power=False): + """Returns a list of node IDs for child nodes, if any. + + :param exclude_dedicated_power: Boolean, Default False, if the list + should exclude nodes which are independently powered. + :returns: A list of any node_id values discovered in the database. + """ + return self.dbapi.get_child_node_ids_by_parent_uuid( + self.uuid, + exclude_dedicated_power=exclude_dedicated_power) + @base.IronicObjectRegistry.register class NodePayload(notification.NotificationPayloadBase): diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index a963536ec1..1906ce430b 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -1289,8 +1289,8 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): def setUp(self): super(DoNodeCleanTestChildNodes, self).setUp() self.config(automated_clean=True, group='conductor') - self.power_off_parent = { - 'step': 'power_off', 'priority': 4, 'interface': 'power'} + self.power_on_parent = { + 'step': 'power_on', 'priority': 4, 'interface': 'power'} self.power_on_children = { 'step': 'power_on', 'priority': 5, 'interface': 'power', 'execute_on_child_nodes': True} @@ -1303,7 +1303,7 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): self.power_on_parent = { 'step': 'power_on', 'priority': 15, 'interface': 'power'} self.clean_steps = [ - self.power_off_parent, + self.power_on_parent, self.power_on_children, self.update_firmware_on_children, self.reboot_children, @@ -1317,6 +1317,8 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): driver_internal_info={'agent_secret_token': 'old', 'clean_steps': self.clean_steps}) + @mock.patch('ironic.drivers.modules.fake.FakePower.get_power_state', + autospec=True) @mock.patch('ironic.drivers.modules.fake.FakePower.reboot', autospec=True) @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state', @@ -1333,7 +1335,7 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): autospec=True) def test_do_next_clean_step_with_children( self, mock_deploy, mock_mgmt, mock_power, mock_pv, mock_nv, - mock_sps, mock_reboot): + mock_sps, mock_reboot, mock_gps): child_node1 = obj_utils.create_test_node( self.context, uuid=uuidutils.generate_uuid(), @@ -1348,7 +1350,12 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): last_error=None, power_state=states.POWER_OFF, parent_node=self.node.uuid) - + mock_gps.side_effect = [states.POWER_OFF, + states.POWER_ON, + states.POWER_OFF, + states.POWER_ON, + states.POWER_OFF, + states.POWER_ON] mock_deploy.return_value = None mock_mgmt.return_value = None mock_power.return_value = None @@ -1380,13 +1387,15 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): self.assertEqual(0, mock_power.call_count) self.assertEqual(0, mock_nv.call_count) self.assertEqual(0, mock_pv.call_count) - self.assertEqual(4, mock_sps.call_count) + self.assertEqual(3, mock_sps.call_count) self.assertEqual(2, mock_reboot.call_count) mock_sps.assert_has_calls([ - mock.call(mock.ANY, mock.ANY, 'power off', timeout=None), + mock.call(mock.ANY, mock.ANY, 'power on', timeout=None), mock.call(mock.ANY, mock.ANY, 'power on', timeout=None), mock.call(mock.ANY, mock.ANY, 'power on', timeout=None)]) + @mock.patch('ironic.drivers.modules.fake.FakePower.get_power_state', + autospec=True) @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state', autospec=True) @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', @@ -1401,7 +1410,7 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): autospec=True) def test_do_next_clean_step_with_children_by_uuid( self, mock_deploy, mock_mgmt, mock_power, mock_pv, mock_nv, - mock_sps): + mock_sps, mock_gps): child_node1 = obj_utils.create_test_node( self.context, uuid=uuidutils.generate_uuid(), @@ -1414,6 +1423,10 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): driver='fake-hardware', last_error=None, parent_node=self.node.uuid) + # NOTE(TheJulia): This sets the stage for the test, + # child node power is off, parent node power is on. + mock_gps.side_effect = [states.POWER_OFF, + states.POWER_ON] power_on_children = { 'step': 'power_on', 'priority': 5, 'interface': 'power', 'execute_on_child_nodes': True, diff --git a/ironic/tests/unit/conductor/test_servicing.py b/ironic/tests/unit/conductor/test_servicing.py index 4604a9ad82..81e0d5d7e5 100644 --- a/ironic/tests/unit/conductor/test_servicing.py +++ b/ironic/tests/unit/conductor/test_servicing.py @@ -1006,8 +1006,8 @@ class DoNodeServiceAbortTestCase(db_base.DbTestCase): class DoNodeCleanTestChildNodes(db_base.DbTestCase): def setUp(self): super(DoNodeCleanTestChildNodes, self).setUp() - self.power_off_parent = { - 'step': 'power_off', 'priority': 4, 'interface': 'power'} + self.power_on_parent = { + 'step': 'power_on', 'priority': 4, 'interface': 'power'} self.power_on_children = { 'step': 'power_on', 'priority': 5, 'interface': 'power', 'execute_on_child_nodes': True} @@ -1017,14 +1017,15 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): self.reboot_children = { 'step': 'reboot', 'priority': 5, 'interface': 'power', 'execute_on_child_nodes': True} - self.power_on_parent = { - 'step': 'power_on', 'priority': 15, 'interface': 'power'} + self.power_off_children = { + 'step': 'power_off', 'priority': 15, 'interface': 'power', + 'execute_on_child_nodes': True} self.service_steps = [ - self.power_off_parent, + self.power_on_parent, self.power_on_children, self.update_firmware_on_children, self.reboot_children, - self.power_on_parent] + self.power_off_children] self.node = obj_utils.create_test_node( self.context, driver='fake-hardware', provision_state=states.SERVICING, @@ -1034,6 +1035,8 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): driver_internal_info={'agent_secret_token': 'old', 'service_steps': self.service_steps}) + @mock.patch('ironic.drivers.modules.fake.FakePower.get_power_state', + autospec=True) @mock.patch('ironic.drivers.modules.fake.FakePower.reboot', autospec=True) @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state', @@ -1050,7 +1053,7 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): autospec=True) def test_do_next_clean_step_with_children( self, mock_deploy, mock_mgmt, mock_power, mock_pv, mock_nv, - mock_sps, mock_reboot): + mock_sps, mock_reboot, mock_gps): child_node1 = obj_utils.create_test_node( self.context, uuid=uuidutils.generate_uuid(), @@ -1066,6 +1069,13 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): power_state=states.POWER_OFF, parent_node=self.node.uuid) + mock_gps.side_effect = [ + states.POWER_OFF, + states.POWER_ON, + states.POWER_OFF, + states.POWER_ON, + states.POWER_OFF, + states.POWER_ON] mock_deploy.return_value = None mock_mgmt.return_value = None mock_power.return_value = None @@ -1097,12 +1107,12 @@ class DoNodeCleanTestChildNodes(db_base.DbTestCase): self.assertEqual(0, mock_power.call_count) self.assertEqual(0, mock_nv.call_count) self.assertEqual(0, mock_pv.call_count) - self.assertEqual(4, mock_sps.call_count) + self.assertEqual(3, mock_sps.call_count) self.assertEqual(2, mock_reboot.call_count) mock_sps.assert_has_calls([ - mock.call(mock.ANY, mock.ANY, 'power off', timeout=None), mock.call(mock.ANY, mock.ANY, 'power on', timeout=None), - mock.call(mock.ANY, mock.ANY, 'power on', timeout=None)]) + mock.call(mock.ANY, mock.ANY, 'power on', timeout=None), + mock.call(mock.ANY, mock.ANY, 'power off', timeout=None)]) @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state', autospec=True) diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 798a29bc66..2cd80b591a 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -774,6 +774,238 @@ class NodePowerActionTestCase(db_base.DbTestCase): self.assertIsNone(node['target_power_state']) self.assertIsNone(node['last_error']) + @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) + def test_node_power_action_power_on_with_parent(self, get_power_mock): + """Test node_power_action to turns on a parent node""" + parent = obj_utils.create_test_node(self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_OFF) + + node = obj_utils.create_test_node(self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_OFF, + last_error='failed before', + parent_node=parent.uuid) + task = task_manager.TaskManager(self.context, node.uuid) + + get_power_mock.side_effect = states.POWER_OFF + + conductor_utils.node_power_action(task, states.POWER_ON) + + node.refresh() + parent.refresh() + # NOTE(TheJulia): We independently check power state + # in this code path so we end up with 3 calls, parent, + # parent as part of node_power_action, and then child. + self.assertEqual(3, get_power_mock.call_count) + self.assertEqual(states.POWER_ON, parent['power_state']) + self.assertEqual(states.POWER_ON, node['power_state']) + self.assertIsNone(parent['target_power_state']) + self.assertIsNone(parent['last_error']) + self.assertIsNone(node['target_power_state']) + self.assertIsNone(node['last_error']) + + @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) + def test_node_power_action_power_on_parent_off(self, get_power_mock): + """Test node_power_action to turns on a parent node""" + parent = obj_utils.create_test_node( + self.context, + name="parent_node", + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_OFF) + + node = obj_utils.create_test_node( + self.context, + name='child_node', + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_OFF, + last_error='failed before', + parent_node=parent.uuid, + driver_info={'has_dedicated_power_supply': True}) + + task = task_manager.TaskManager(self.context, node.uuid) + + get_power_mock.side_effect = states.POWER_OFF + + conductor_utils.node_power_action(task, states.POWER_ON) + + node.refresh() + parent.refresh() + self.assertEqual(1, get_power_mock.call_count) + self.assertEqual(states.POWER_OFF, parent['power_state']) + self.assertEqual(states.POWER_ON, node['power_state']) + self.assertIsNone(parent['target_power_state']) + self.assertIsNone(parent['last_error']) + self.assertIsNone(node['target_power_state']) + self.assertIsNone(node['last_error']) + + @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) + def test_node_power_action_power_off_parent(self, get_power_mock): + """Test node_power_action to turns on a parent node""" + parent = obj_utils.create_test_node( + self.context, + name="parent", + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_ON) + + node = obj_utils.create_test_node( + self.context, + name="child", + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_ON, + last_error='failed before', + parent_node=parent.uuid, + driver_info={'has_dedicated_power_supply': False}) + + task = task_manager.TaskManager(self.context, parent.uuid) + + get_power_mock.side_effect = states.POWER_ON + + conductor_utils.node_power_action(task, states.POWER_OFF) + + node.refresh() + parent.refresh() + self.assertEqual(3, get_power_mock.call_count) + self.assertEqual(states.POWER_OFF, parent['power_state']) + self.assertEqual(states.POWER_OFF, node['power_state']) + self.assertIsNone(parent['target_power_state']) + self.assertIsNone(parent['last_error']) + self.assertIsNone(node['target_power_state']) + self.assertIsNone(node['last_error']) + + @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) + def test_node_power_action_power_off_parent_child_remains( + self, get_power_mock): + """Test node_power_action to turns on a parent node""" + parent = obj_utils.create_test_node( + self.context, + name="parent", + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_ON) + + node = obj_utils.create_test_node( + self.context, + name="child", + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_ON, + last_error='failed before', + parent_node=parent.uuid, + driver_info={'has_dedicated_power_supply': True}) + + task = task_manager.TaskManager(self.context, parent.uuid) + + get_power_mock.side_effect = states.POWER_ON + + conductor_utils.node_power_action(task, states.POWER_OFF) + + node.refresh() + parent.refresh() + self.assertEqual(1, get_power_mock.call_count) + self.assertEqual(states.POWER_OFF, parent['power_state']) + self.assertEqual(states.POWER_ON, node['power_state']) + self.assertIsNone(parent['target_power_state']) + self.assertIsNone(parent['last_error']) + self.assertIsNone(node['target_power_state']) + self.assertIsNotNone(node['last_error']) + + @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) + def test_node_power_action_power_on_exception_if_parent_locked( + self, + get_power_mock): + """Test node_power_action to turns on a parent node""" + parent = obj_utils.create_test_node( + self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_OFF, + reservation='meow') + + node = obj_utils.create_test_node( + self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_OFF, + last_error='failed before', + parent_node=parent.uuid) + + task = task_manager.TaskManager(self.context, node.uuid) + + get_power_mock.side_effect = states.POWER_OFF + + self.assertRaises(exception.ParentNodeLocked, + conductor_utils.node_power_action, + task, states.POWER_ON) + node.refresh() + parent.refresh() + self.assertEqual(2, get_power_mock.call_count) + self.assertEqual(states.POWER_OFF, parent['power_state']) + self.assertEqual(states.POWER_OFF, node['power_state']) + self.assertIsNone(parent['target_power_state']) + self.assertIsNone(parent['last_error']) + self.assertIn('is presently locked', node['last_error']) + self.assertIsNone(node['target_power_state']) + + @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) + def test_node_power_action_power_off_exception_if_child_locked( + self, + get_power_mock): + """Test node_power_action to turns on a parent node""" + parent = obj_utils.create_test_node( + self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_ON) + + node = obj_utils.create_test_node( + self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_ON, + last_error='failed before', + parent_node=parent.uuid) + + locked_node = obj_utils.create_test_node( + self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_ON, + last_error='failed before', + parent_node=parent.uuid, + reservation='foo') + + task = task_manager.TaskManager(self.context, parent.uuid) + + get_power_mock.side_effect = states.POWER_ON + + self.assertRaises(exception.ChildNodeLocked, + conductor_utils.node_power_action, + task, states.POWER_OFF) + # Since we launched the task this way, we need to explicitly + # release it. + task.release_resources() + node.refresh() + parent.refresh() + self.assertEqual(3, get_power_mock.call_count) + self.assertEqual(states.POWER_ON, parent['power_state']) + self.assertEqual(states.POWER_ON, locked_node['power_state']) + self.assertEqual(states.POWER_ON, node['power_state']) + self.assertIsNone(parent['target_power_state']) + self.assertEqual('failed before', node['last_error']) + self.assertIn('is presently locked', parent['last_error']) + self.assertIsNone(node['target_power_state']) + self.assertIsNone(parent['target_power_state']) + self.assertIsNone(locked_node['target_power_state']) + self.assertIsNone(node['reservation']) + self.assertIsNone(parent['reservation']) + def test__calculate_target_state(self): for new_state in (states.POWER_ON, states.REBOOT, states.SOFT_REBOOT): self.assertEqual( diff --git a/releasenotes/notes/handle-power-ops-for-child-nodes-67a11f1900ce137a.yaml b/releasenotes/notes/handle-power-ops-for-child-nodes-67a11f1900ce137a.yaml new file mode 100644 index 0000000000..6ef2921a63 --- /dev/null +++ b/releasenotes/notes/handle-power-ops-for-child-nodes-67a11f1900ce137a.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Fixes the power handling flow as it relates to ``child nodes``, i.e. + bare metal nodes which have a ``parent_node`` set, such that power is + turned off on those nodes when the parent node is powered off, and that + power is turned on for the parent node when the child node is explicitly + requested to be in a ``power on`` state. This does not apply if the child + node device has a dedicated power supply, as indicated through a + ``driver_info`` parameter named ``has_dedicated_power_supply`` which + can be set to a value of "true".