From 23745d97fe4e154c79389a951b4fd47d49fed494 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Fri, 8 Dec 2023 18:04:02 +0100 Subject: [PATCH] Fix two severe errors in the firmware caching code First, it tries to create components even if the current version is not known and fails with a database constraint error (because the initial version cannot be NULL). Can be reproduced with sushy-tools before https://opendev.org/openstack/sushy-tools/commit/37f118237a5e7788301968ad123f19f2600c1418 Second, unexpected exceptions are not handled in the caching code, so any of them will cause the node to get stuck in cleaning forever. On top of that, the caching code is missing a metrics decorator. This change does not update any unit tests because none currently exist. Change-Id: Iaa242ca6aa6138fcdaaf63b763708e2f1e559cb0 --- ironic/conductor/utils.py | 7 ++++++- ironic/drivers/modules/redfish/firmware.py | 19 +++++++++++-------- .../notes/firmware-fail-c6f6c70220373033.yaml | 8 ++++++++ 3 files changed, 25 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/firmware-fail-c6f6c70220373033.yaml diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index cb30fdf549..82efd76a79 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -1449,8 +1449,8 @@ def node_cache_bios_settings(task, node): except exception.UnsupportedDriverExtension: LOG.warning('BIOS settings are not supported for node %s, ' 'skipping', node.uuid) - # TODO(zshi) remove this check when classic drivers are removed except Exception: + # NOTE(dtantsur): the caller expects this function to never fail msg = (_('Caching of bios settings failed on node %(node)s.') % {'node': node.uuid}) LOG.exception(msg) @@ -1474,6 +1474,7 @@ def node_cache_vendor(task): except exception.UnsupportedDriverExtension: return except Exception as exc: + # NOTE(dtantsur): the caller expects this function to never fail LOG.warning('Unexpected exception when trying to detect vendor ' 'for node %(node)s. %(class)s: %(exc)s', {'node': task.node.uuid, @@ -1840,6 +1841,10 @@ def node_cache_firmware_components(task): except exception.UnsupportedDriverExtension: LOG.warning('Firmware Components are not supported for node %s, ' 'skipping', task.node.uuid) + except Exception: + # NOTE(dtantsur): the caller expects this function to never fail + LOG.exception('Caching of firmware components failed on node %s', + task.node.uuid) def run_node_action(task, call, error_msg, success_msg=None, **kwargs): diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py index 207c61aadf..c325213a6d 100644 --- a/ironic/drivers/modules/redfish/firmware.py +++ b/ironic/drivers/modules/redfish/firmware.py @@ -71,6 +71,7 @@ class RedfishFirmware(base.FirmwareInterface): """ redfish_utils.parse_driver_info(task.node) + @METRICS.timer('RedfishFirmware.cache_firmware_components') def cache_firmware_components(self, task): """Store or update Firmware Components on the given node. @@ -90,25 +91,27 @@ class RedfishFirmware(base.FirmwareInterface): system = redfish_utils.get_system(task.node) - bios_fw = {'component': 'bios', - 'current_version': system.bios_version} - settings.append(bios_fw) + if system.bios_version: + bios_fw = {'component': 'bios', + 'current_version': system.bios_version} + settings.append(bios_fw) # NOTE(iurygregory): normally we only relay on the System to # perform actions, but to retrieve the BMC Firmware we need to # access the Manager. try: manager = redfish_utils.get_manager(task.node, system) - bmc_fw = {'component': 'bmc', - 'current_version': manager.firmware_version} - settings.append(bmc_fw) + if manager.firmware_version: + bmc_fw = {'component': 'bmc', + 'current_version': manager.firmware_version} + settings.append(bmc_fw) except exception.RedfishError: LOG.warning('No manager available to retrieve Firmware ' 'from the bmc of node %s', task.node.uuid) if not settings: - error_msg = (_('Cannot retrieve firmware for node %s.') - % task.node.uuid) + error_msg = (_('Cannot retrieve firmware for node %s: no ' + 'supported components') % task.node.uuid) LOG.error(error_msg) raise exception.UnsupportedDriverExtension(error_msg) diff --git a/releasenotes/notes/firmware-fail-c6f6c70220373033.yaml b/releasenotes/notes/firmware-fail-c6f6c70220373033.yaml new file mode 100644 index 0000000000..98663df1f2 --- /dev/null +++ b/releasenotes/notes/firmware-fail-c6f6c70220373033.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Nodes no longer get stuck in cleaning when the firmware components caching + code raises an unexpected exception. + - | + Prevents a database constraints error on caching firmware components + when a supported component does not have the current version.