Add cleanwait timeout cleanup process

Previously, if a node in a cleaning state timed out, the timeout
process would not purge certain items from the node's configuration
which resulted in a short circuiting of the logic cleaning being
retried. This was a result of the node clean_step configuration
not being purged upon a timeout occuring.

This change adds a wrapper method around the cleaning failure
error handler to allow the _fail_if_in_state method to call
the error handler, since error handler syntax is not uniform
and the _fail_if_in_state cannot pass arguments.

It also changes the cleaning error handler to permit the error
handler to delete the node clean_step, and cleaning related
driver_internal_info configuration from a node in the event
the node in in CLEANFAIL state.

Change-Id: I9ee5c0b385648c9b7e1d330d5d1af9b2c486a436
Closes-Bug: #1590146
This commit is contained in:
Julia Kreger 2016-06-08 18:39:09 -04:00
parent a155f30c88
commit 42e2685395
5 changed files with 69 additions and 7 deletions

View File

@ -1356,13 +1356,10 @@ class ConductorManager(base_manager.BaseConductorManager):
'provision_state': states.CLEANWAIT, 'provision_state': states.CLEANWAIT,
'maintenance': False, 'maintenance': False,
'provisioned_before': callback_timeout} 'provisioned_before': callback_timeout}
last_error = _("Timeout reached while cleaning the node. Please "
"check if the ramdisk responsible for the cleaning is "
"running on the node.")
self._fail_if_in_state(context, filters, states.CLEANWAIT, self._fail_if_in_state(context, filters, states.CLEANWAIT,
'provision_updated_at', 'provision_updated_at',
last_error=last_error, keep_target_state=True,
keep_target_state=True) callback_method=utils.cleanup_cleanwait_timeout)
@periodics.periodic(spacing=CONF.conductor.sync_local_state_interval) @periodics.periodic(spacing=CONF.conductor.sync_local_state_interval)
def _sync_local_state(self, context): def _sync_local_state(self, context):

View File

@ -202,11 +202,27 @@ def provisioning_error_handler(e, node, provision_state,
'tgt_prov_state': target_provision_state}) 'tgt_prov_state': target_provision_state})
def cleanup_cleanwait_timeout(task):
"""Cleanup a cleaning task after timeout.
:param task: a TaskManager instance.
"""
last_error = (_("Timeout reached while cleaning the node. Please "
"check if the ramdisk responsible for the cleaning is "
"running on the node. Failed on step %(step)s.") %
{'step': task.node.clean_step})
cleaning_error_handler(task, msg=last_error,
set_fail_state=True)
def cleaning_error_handler(task, msg, tear_down_cleaning=True, def cleaning_error_handler(task, msg, tear_down_cleaning=True,
set_fail_state=True): set_fail_state=True):
"""Put a failed node in CLEANFAIL and maintenance.""" """Put a failed node in CLEANFAIL and maintenance."""
node = task.node node = task.node
if node.provision_state in (states.CLEANING, states.CLEANWAIT): if node.provision_state in (
states.CLEANING,
states.CLEANWAIT,
states.CLEANFAIL):
# Clear clean step, msg should already include current step # Clear clean step, msg should already include current step
node.clean_step = {} node.clean_step = {}
info = node.driver_internal_info info = node.driver_internal_info

View File

@ -1100,7 +1100,13 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
self.context, driver='fake', self.context, driver='fake',
provision_state=states.CLEANWAIT, provision_state=states.CLEANWAIT,
target_provision_state=tgt_prov_state, target_provision_state=tgt_prov_state,
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0)) provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0),
clean_step={
'interface': 'deploy',
'step': 'erase_devices'},
driver_internal_info={
'cleaning_reboot': manual,
'clean_step_index': 0})
self.service._check_cleanwait_timeouts(self.context) self.service._check_cleanwait_timeouts(self.context)
self._stop_service() self._stop_service()
@ -1108,6 +1114,11 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertIsNotNone(node.last_error) self.assertIsNotNone(node.last_error)
# Test that cleaning parameters have been purged in order
# to prevent looping of the cleaning sequence
self.assertEqual({}, node.clean_step)
self.assertNotIn('clean_step_index', node.driver_internal_info)
self.assertNotIn('cleaning_reboot', node.driver_internal_info)
def test__check_cleanwait_timeouts_automated_clean(self): def test__check_cleanwait_timeouts_automated_clean(self):
self._check_cleanwait_timeouts() self._check_cleanwait_timeouts()

View File

@ -572,6 +572,37 @@ class ErrorHandlersTestCase(tests_base.TestCase):
self.assertFalse(self.node.save.called) self.assertFalse(self.node.save.called)
self.assertFalse(log_mock.warning.called) self.assertFalse(log_mock.warning.called)
@mock.patch.object(conductor_utils, 'cleaning_error_handler')
def test_cleanup_cleanwait_timeout_handler_call(self, mock_error_handler):
self.node.clean_step = {}
conductor_utils.cleanup_cleanwait_timeout(self.task)
mock_error_handler.assert_called_once_with(
self.task,
msg="Timeout reached while cleaning the node. Please "
"check if the ramdisk responsible for the cleaning is "
"running on the node. Failed on step {}.",
set_fail_state=True)
def test_cleanup_cleanwait_timeout(self):
self.node.provision_state = states.CLEANFAIL
target = 'baz'
self.node.target_provision_state = target
self.node.driver_internal_info = {}
self.node.clean_step = {'key': 'val'}
clean_error = ("Timeout reached while cleaning the node. Please "
"check if the ramdisk responsible for the cleaning is "
"running on the node. Failed on step {'key': 'val'}.")
self.node.driver_internal_info = {
'cleaning_reboot': True,
'clean_step_index': 0}
conductor_utils.cleanup_cleanwait_timeout(self.task)
self.assertEqual({}, self.node.clean_step)
self.assertNotIn('clean_step_index', self.node.driver_internal_info)
self.task.process_event.assert_called_once()
self.assertTrue(self.node.maintenance)
self.assertEqual(clean_error, self.node.maintenance_reason)
def test_cleaning_error_handler(self): def test_cleaning_error_handler(self):
self.node.provision_state = states.CLEANING self.node.provision_state = states.CLEANING
target = 'baz' target = 'baz'

View File

@ -0,0 +1,7 @@
---
fixes:
- A bug has been corrected where a node's current clean_step
was not purged upon that node timing out from a CLEANWAIT state.
Previously, this bug would prevent a user from retrying cleaning
operations. For more information,
see https://bugs.launchpad.net/ironic/+bug/1590146.