From 23951f4b44e64513da047c474f0a49408ddf9208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aija=20Jaunt=C4=93va?= Date: Fri, 2 Oct 2020 04:57:42 -0400 Subject: [PATCH] Fix idrac-wsman RAID step async error handling Instead of using process_event('fail') use error_handlers, otherwise in case of failure node gets stuck and fails because of timeout, instead of failing earlier due to step failure. Story: 2008307 Task: 41194 Change-Id: Ieec0173f57367587985d2baad77205bb83e8b69a --- ironic/drivers/modules/drac/raid.py | 19 +++++++----- .../modules/drac/test_periodic_task.py | 30 +++++++++++++++---- ...-step-error-handling-f44e2001ac018d12.yaml | 8 +++++ 3 files changed, 44 insertions(+), 13 deletions(-) create mode 100644 releasenotes/notes/fix-wsman-raid-async-step-error-handling-f44e2001ac018d12.yaml diff --git a/ironic/drivers/modules/drac/raid.py b/ironic/drivers/modules/drac/raid.py index 8acf9188df..6de316799c 100644 --- a/ironic/drivers/modules/drac/raid.py +++ b/ironic/drivers/modules/drac/raid.py @@ -1491,13 +1491,18 @@ class DracWSManRAID(base.RAIDInterface): node.save() def _set_failed(self, task, config_job): - LOG.error("RAID configuration job failed for node %(node)s. " - "Failed config job: %(config_job_id)s. " - "Message: '%(message)s'.", - {'node': task.node.uuid, 'config_job_id': config_job.id, - 'message': config_job.message}) - task.node.last_error = config_job.message - task.process_event('fail') + error_msg = (_("Failed config job: %(config_job_id)s. " + "Message: '%(message)s'.") % + {'config_job_id': config_job.id, + 'message': config_job.message}) + log_msg = (_("RAID configuration job failed for node %(node)s. " + "%(error)s") % + {'node': task.node.uuid, 'error': error_msg}) + if task.node.clean_step: + LOG.error(log_msg) + manager_utils.cleaning_error_handler(task, error_msg) + else: + manager_utils.deploying_error_handler(task, log_msg, error_msg) def _resume(self, task): raid_common.update_raid_info( diff --git a/ironic/tests/unit/drivers/modules/drac/test_periodic_task.py b/ironic/tests/unit/drivers/modules/drac/test_periodic_task.py index ba5b7ae5e6..ec7f4e27c1 100644 --- a/ironic/tests/unit/drivers/modules/drac/test_periodic_task.py +++ b/ironic/tests/unit/drivers/modules/drac/test_periodic_task.py @@ -203,9 +203,11 @@ class DracPeriodicTaskTestCase(db_base.DbTestCase): self._test__check_node_raid_jobs_with_completed_job( mock_notify_conductor_resume) + @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True) @mock.patch.object(drac_common, 'get_drac_client', spec_set=True, autospec=True) - def test__check_node_raid_jobs_with_failed_job(self, mock_get_drac_client): + def test__check_node_raid_jobs_with_failed_job( + self, mock_get_drac_client, mock_cleaning_error_handler): # mock node.driver_internal_info and node.clean_step driver_internal_info = {'raid_config_job_ids': ['42']} self.node.driver_internal_info = driver_internal_info @@ -232,15 +234,18 @@ class DracPeriodicTaskTestCase(db_base.DbTestCase): self.assertEqual([], self.node.driver_internal_info['raid_config_job_ids']) self.assertEqual({}, self.node.raid_config) - task.process_event.assert_called_once_with('fail') + mock_cleaning_error_handler.assert_called_once_with(task, mock.ANY) + @mock.patch.object(manager_utils, 'deploying_error_handler', autospec=True) + @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True) @mock.patch.object(drac_common, 'get_drac_client', spec_set=True, autospec=True) @mock.patch.object(drac_raid.DracRAID, 'get_logical_disks', spec_set=True, autospec=True) def _test__check_node_raid_jobs_with_completed_job_already_failed( self, mock_notify_conductor_resume, - mock_get_logical_disks, mock_get_drac_client): + mock_get_logical_disks, mock_get_drac_client, + mock_cleaning_error_handler, mock_deploying_error_handler): expected_logical_disk = {'size_gb': 558, 'raid_level': '1', 'name': 'disk 0'} @@ -271,7 +276,12 @@ class DracPeriodicTaskTestCase(db_base.DbTestCase): self.assertNotIn('raid_config_job_failure', self.node.driver_internal_info) self.assertNotIn('logical_disks', self.node.raid_config) - task.process_event.assert_called_once_with('fail') + if self.node.clean_step: + mock_cleaning_error_handler.assert_called_once_with(task, mock.ANY) + else: + mock_deploying_error_handler.assert_called_once_with(task, + mock.ANY, + mock.ANY) self.assertFalse(mock_notify_conductor_resume.called) @mock.patch.object(manager_utils, 'notify_conductor_resume_clean', @@ -346,13 +356,16 @@ class DracPeriodicTaskTestCase(db_base.DbTestCase): self._test__check_node_raid_jobs_with_multiple_jobs_completed( mock_notify_conductor_resume) + @mock.patch.object(manager_utils, 'deploying_error_handler', autospec=True) + @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True) @mock.patch.object(drac_common, 'get_drac_client', spec_set=True, autospec=True) @mock.patch.object(drac_raid.DracRAID, 'get_logical_disks', spec_set=True, autospec=True) def _test__check_node_raid_jobs_with_multiple_jobs_failed( self, mock_notify_conductor_resume, - mock_get_logical_disks, mock_get_drac_client): + mock_get_logical_disks, mock_get_drac_client, + mock_cleaning_error_handler, mock_deploying_error_handler): expected_logical_disk = {'size_gb': 558, 'raid_level': '1', 'name': 'disk 0'} @@ -387,7 +400,12 @@ class DracPeriodicTaskTestCase(db_base.DbTestCase): self.assertNotIn('raid_config_job_failure', self.node.driver_internal_info) self.assertNotIn('logical_disks', self.node.raid_config) - task.process_event.assert_called_once_with('fail') + if self.node.clean_step: + mock_cleaning_error_handler.assert_called_once_with(task, mock.ANY) + else: + mock_deploying_error_handler.assert_called_once_with(task, + mock.ANY, + mock.ANY) self.assertFalse(mock_notify_conductor_resume.called) @mock.patch.object(manager_utils, 'notify_conductor_resume_clean', diff --git a/releasenotes/notes/fix-wsman-raid-async-step-error-handling-f44e2001ac018d12.yaml b/releasenotes/notes/fix-wsman-raid-async-step-error-handling-f44e2001ac018d12.yaml new file mode 100644 index 0000000000..b76a4c7858 --- /dev/null +++ b/releasenotes/notes/fix-wsman-raid-async-step-error-handling-f44e2001ac018d12.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes ``idrac-wsman`` RAID ``create_configuration`` clean step, + ``apply_configuration`` deploy step and ``delete_configuration`` clean and + deploy step to fail correctly in case of error when checking completed + jobs. Before the fix when RAID job failed, then node cleaning or deploying + failed with timeout instead of actual error in clean or deploy step.