Add cleanwait timeout cleanup process
Previously, if a node in a cleaning state timed out, the timeout process would not purge certain items from the node's configuration which resulted in a short circuiting of the logic cleaning being retried. This was a result of the node clean_step configuration not being purged upon a timeout occuring. This change adds a wrapper method around the cleaning failure error handler to allow the _fail_if_in_state method to call the error handler, since error handler syntax is not uniform and the _fail_if_in_state cannot pass arguments. It also changes the cleaning error handler to permit the error handler to delete the node clean_step, and cleaning related driver_internal_info configuration from a node in the event the node in in CLEANFAIL state. Change-Id: I9ee5c0b385648c9b7e1d330d5d1af9b2c486a436 Closes-Bug: #1590146
This commit is contained in:
parent
a155f30c88
commit
42e2685395
@ -1356,13 +1356,10 @@ class ConductorManager(base_manager.BaseConductorManager):
|
|||||||
'provision_state': states.CLEANWAIT,
|
'provision_state': states.CLEANWAIT,
|
||||||
'maintenance': False,
|
'maintenance': False,
|
||||||
'provisioned_before': callback_timeout}
|
'provisioned_before': callback_timeout}
|
||||||
last_error = _("Timeout reached while cleaning the node. Please "
|
|
||||||
"check if the ramdisk responsible for the cleaning is "
|
|
||||||
"running on the node.")
|
|
||||||
self._fail_if_in_state(context, filters, states.CLEANWAIT,
|
self._fail_if_in_state(context, filters, states.CLEANWAIT,
|
||||||
'provision_updated_at',
|
'provision_updated_at',
|
||||||
last_error=last_error,
|
keep_target_state=True,
|
||||||
keep_target_state=True)
|
callback_method=utils.cleanup_cleanwait_timeout)
|
||||||
|
|
||||||
@periodics.periodic(spacing=CONF.conductor.sync_local_state_interval)
|
@periodics.periodic(spacing=CONF.conductor.sync_local_state_interval)
|
||||||
def _sync_local_state(self, context):
|
def _sync_local_state(self, context):
|
||||||
|
@ -202,11 +202,27 @@ def provisioning_error_handler(e, node, provision_state,
|
|||||||
'tgt_prov_state': target_provision_state})
|
'tgt_prov_state': target_provision_state})
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_cleanwait_timeout(task):
|
||||||
|
"""Cleanup a cleaning task after timeout.
|
||||||
|
|
||||||
|
:param task: a TaskManager instance.
|
||||||
|
"""
|
||||||
|
last_error = (_("Timeout reached while cleaning the node. Please "
|
||||||
|
"check if the ramdisk responsible for the cleaning is "
|
||||||
|
"running on the node. Failed on step %(step)s.") %
|
||||||
|
{'step': task.node.clean_step})
|
||||||
|
cleaning_error_handler(task, msg=last_error,
|
||||||
|
set_fail_state=True)
|
||||||
|
|
||||||
|
|
||||||
def cleaning_error_handler(task, msg, tear_down_cleaning=True,
|
def cleaning_error_handler(task, msg, tear_down_cleaning=True,
|
||||||
set_fail_state=True):
|
set_fail_state=True):
|
||||||
"""Put a failed node in CLEANFAIL and maintenance."""
|
"""Put a failed node in CLEANFAIL and maintenance."""
|
||||||
node = task.node
|
node = task.node
|
||||||
if node.provision_state in (states.CLEANING, states.CLEANWAIT):
|
if node.provision_state in (
|
||||||
|
states.CLEANING,
|
||||||
|
states.CLEANWAIT,
|
||||||
|
states.CLEANFAIL):
|
||||||
# Clear clean step, msg should already include current step
|
# Clear clean step, msg should already include current step
|
||||||
node.clean_step = {}
|
node.clean_step = {}
|
||||||
info = node.driver_internal_info
|
info = node.driver_internal_info
|
||||||
|
@ -1100,7 +1100,13 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
|
|||||||
self.context, driver='fake',
|
self.context, driver='fake',
|
||||||
provision_state=states.CLEANWAIT,
|
provision_state=states.CLEANWAIT,
|
||||||
target_provision_state=tgt_prov_state,
|
target_provision_state=tgt_prov_state,
|
||||||
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0))
|
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0),
|
||||||
|
clean_step={
|
||||||
|
'interface': 'deploy',
|
||||||
|
'step': 'erase_devices'},
|
||||||
|
driver_internal_info={
|
||||||
|
'cleaning_reboot': manual,
|
||||||
|
'clean_step_index': 0})
|
||||||
|
|
||||||
self.service._check_cleanwait_timeouts(self.context)
|
self.service._check_cleanwait_timeouts(self.context)
|
||||||
self._stop_service()
|
self._stop_service()
|
||||||
@ -1108,6 +1114,11 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
|
|||||||
self.assertEqual(states.CLEANFAIL, node.provision_state)
|
self.assertEqual(states.CLEANFAIL, node.provision_state)
|
||||||
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
||||||
self.assertIsNotNone(node.last_error)
|
self.assertIsNotNone(node.last_error)
|
||||||
|
# Test that cleaning parameters have been purged in order
|
||||||
|
# to prevent looping of the cleaning sequence
|
||||||
|
self.assertEqual({}, node.clean_step)
|
||||||
|
self.assertNotIn('clean_step_index', node.driver_internal_info)
|
||||||
|
self.assertNotIn('cleaning_reboot', node.driver_internal_info)
|
||||||
|
|
||||||
def test__check_cleanwait_timeouts_automated_clean(self):
|
def test__check_cleanwait_timeouts_automated_clean(self):
|
||||||
self._check_cleanwait_timeouts()
|
self._check_cleanwait_timeouts()
|
||||||
|
@ -572,6 +572,37 @@ class ErrorHandlersTestCase(tests_base.TestCase):
|
|||||||
self.assertFalse(self.node.save.called)
|
self.assertFalse(self.node.save.called)
|
||||||
self.assertFalse(log_mock.warning.called)
|
self.assertFalse(log_mock.warning.called)
|
||||||
|
|
||||||
|
@mock.patch.object(conductor_utils, 'cleaning_error_handler')
|
||||||
|
def test_cleanup_cleanwait_timeout_handler_call(self, mock_error_handler):
|
||||||
|
self.node.clean_step = {}
|
||||||
|
conductor_utils.cleanup_cleanwait_timeout(self.task)
|
||||||
|
|
||||||
|
mock_error_handler.assert_called_once_with(
|
||||||
|
self.task,
|
||||||
|
msg="Timeout reached while cleaning the node. Please "
|
||||||
|
"check if the ramdisk responsible for the cleaning is "
|
||||||
|
"running on the node. Failed on step {}.",
|
||||||
|
set_fail_state=True)
|
||||||
|
|
||||||
|
def test_cleanup_cleanwait_timeout(self):
|
||||||
|
self.node.provision_state = states.CLEANFAIL
|
||||||
|
target = 'baz'
|
||||||
|
self.node.target_provision_state = target
|
||||||
|
self.node.driver_internal_info = {}
|
||||||
|
self.node.clean_step = {'key': 'val'}
|
||||||
|
clean_error = ("Timeout reached while cleaning the node. Please "
|
||||||
|
"check if the ramdisk responsible for the cleaning is "
|
||||||
|
"running on the node. Failed on step {'key': 'val'}.")
|
||||||
|
self.node.driver_internal_info = {
|
||||||
|
'cleaning_reboot': True,
|
||||||
|
'clean_step_index': 0}
|
||||||
|
conductor_utils.cleanup_cleanwait_timeout(self.task)
|
||||||
|
self.assertEqual({}, self.node.clean_step)
|
||||||
|
self.assertNotIn('clean_step_index', self.node.driver_internal_info)
|
||||||
|
self.task.process_event.assert_called_once()
|
||||||
|
self.assertTrue(self.node.maintenance)
|
||||||
|
self.assertEqual(clean_error, self.node.maintenance_reason)
|
||||||
|
|
||||||
def test_cleaning_error_handler(self):
|
def test_cleaning_error_handler(self):
|
||||||
self.node.provision_state = states.CLEANING
|
self.node.provision_state = states.CLEANING
|
||||||
target = 'baz'
|
target = 'baz'
|
||||||
|
@ -0,0 +1,7 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- A bug has been corrected where a node's current clean_step
|
||||||
|
was not purged upon that node timing out from a CLEANWAIT state.
|
||||||
|
Previously, this bug would prevent a user from retrying cleaning
|
||||||
|
operations. For more information,
|
||||||
|
see https://bugs.launchpad.net/ironic/+bug/1590146.
|
Loading…
Reference in New Issue
Block a user