Merge "Fix for failure in cleaning"
This commit is contained in:
commit
90cccaa520
@ -791,3 +791,7 @@ class BIOSSettingListNotFound(NotFound):
|
||||
|
||||
class DatabaseVersionTooOld(IronicException):
|
||||
_msg_fmt = _("Database version is too old")
|
||||
|
||||
|
||||
class AgentConnectionFailed(IronicException):
|
||||
_msg_fmt = _("Connection to agent failed: %(reason)s")
|
||||
|
@ -1311,6 +1311,19 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
try:
|
||||
result = interface.execute_clean_step(task, step)
|
||||
except Exception as e:
|
||||
if isinstance(e, exception.AgentConnectionFailed):
|
||||
if task.node.driver_internal_info.get('cleaning_reboot'):
|
||||
LOG.info('Agent is not yet running on node %(node)s '
|
||||
'after cleaning reboot, waiting for agent to '
|
||||
'come up to run next clean step %(step)s.',
|
||||
{'node': node.uuid, 'step': step})
|
||||
driver_internal_info['skip_current_clean_step'] = False
|
||||
node.driver_internal_info = driver_internal_info
|
||||
target_state = (states.MANAGEABLE if manual_clean
|
||||
else None)
|
||||
task.process_event('wait', target_state=target_state)
|
||||
return
|
||||
|
||||
msg = (_('Node %(node)s failed step %(step)s: '
|
||||
'%(exc)s') %
|
||||
{'node': node.uuid, 'exc': e,
|
||||
@ -1345,6 +1358,7 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
driver_internal_info = node.driver_internal_info
|
||||
driver_internal_info['clean_steps'] = None
|
||||
driver_internal_info.pop('clean_step_index', None)
|
||||
driver_internal_info.pop('cleaning_reboot', None)
|
||||
node.driver_internal_info = driver_internal_info
|
||||
node.save()
|
||||
try:
|
||||
|
@ -466,6 +466,7 @@ class AgentDeployMixin(HeartbeatMixin):
|
||||
info = task.node.driver_internal_info
|
||||
info.pop('cleaning_reboot', None)
|
||||
task.node.driver_internal_info = info
|
||||
task.node.save()
|
||||
manager_utils.notify_conductor_resume_clean(task)
|
||||
return
|
||||
else:
|
||||
|
@ -81,6 +81,12 @@ class AgentClient(object):
|
||||
|
||||
try:
|
||||
response = self.session.post(url, params=request_params, data=body)
|
||||
except requests.ConnectionError as e:
|
||||
msg = (_('Failed to invoke agent command %(method)s for node '
|
||||
'%(node)s. Error: %(error)s') %
|
||||
{'method': method, 'node': node.uuid, 'error': e})
|
||||
LOG.error(msg)
|
||||
raise exception.AgentConnectionFailed(reason=msg)
|
||||
except requests.RequestException as e:
|
||||
msg = (_('Error invoking agent command %(method)s for node '
|
||||
'%(node)s. Error: %(error)s') %
|
||||
|
@ -896,4 +896,8 @@ class DracRAID(base.RAIDInterface):
|
||||
def _resume_cleaning(self, task):
|
||||
raid_common.update_raid_info(
|
||||
task.node, self.get_logical_disks(task))
|
||||
driver_internal_info = task.node.driver_internal_info
|
||||
driver_internal_info['cleaning_reboot'] = True
|
||||
task.node.driver_internal_info = driver_internal_info
|
||||
task.node.save()
|
||||
manager_utils.notify_conductor_resume_clean(task)
|
||||
|
@ -3665,6 +3665,114 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
def test__do_next_clean_step_manual_execute_fail(self):
|
||||
self._do_next_clean_step_execute_fail(manual=True)
|
||||
|
||||
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
|
||||
autospec=True)
|
||||
def test_do_next_clean_step_oob_reboot(self, mock_execute):
|
||||
# When a clean step fails, go to CLEANWAIT
|
||||
tgt_prov_state = states.MANAGEABLE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake-hardware',
|
||||
provision_state=states.CLEANING,
|
||||
target_provision_state=tgt_prov_state,
|
||||
last_error=None,
|
||||
driver_internal_info={'clean_steps': self.clean_steps,
|
||||
'clean_step_index': None,
|
||||
'cleaning_reboot': True},
|
||||
clean_step={})
|
||||
mock_execute.side_effect = exception.AgentConnectionFailed(
|
||||
reason='failed')
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, 0)
|
||||
|
||||
self._stop_service()
|
||||
node.refresh()
|
||||
|
||||
# Make sure we go to CLEANWAIT
|
||||
self.assertEqual(states.CLEANWAIT, node.provision_state)
|
||||
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
||||
self.assertEqual(self.clean_steps[0], node.clean_step)
|
||||
self.assertEqual(0, node.driver_internal_info['clean_step_index'])
|
||||
self.assertFalse(node.driver_internal_info['skip_current_clean_step'])
|
||||
mock_execute.assert_called_once_with(
|
||||
mock.ANY, mock.ANY, self.clean_steps[0])
|
||||
|
||||
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
|
||||
autospec=True)
|
||||
def test_do_next_clean_step_oob_reboot_last_step(self, mock_execute):
|
||||
# Resume where last_step is the last cleaning step
|
||||
tgt_prov_state = states.MANAGEABLE
|
||||
info = {'clean_steps': self.clean_steps,
|
||||
'cleaning_reboot': True,
|
||||
'clean_step_index': len(self.clean_steps) - 1}
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake-hardware',
|
||||
provision_state=states.CLEANING,
|
||||
target_provision_state=tgt_prov_state,
|
||||
last_error=None,
|
||||
driver_internal_info=info,
|
||||
clean_step=self.clean_steps[-1])
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, None)
|
||||
|
||||
self._stop_service()
|
||||
node.refresh()
|
||||
|
||||
# Cleaning should be complete without calling additional steps
|
||||
self.assertEqual(tgt_prov_state, node.provision_state)
|
||||
self.assertEqual(states.NOSTATE, node.target_provision_state)
|
||||
self.assertEqual({}, node.clean_step)
|
||||
self.assertNotIn('clean_step_index', node.driver_internal_info)
|
||||
self.assertNotIn('cleaning_reboot', node.driver_internal_info)
|
||||
self.assertIsNone(node.driver_internal_info['clean_steps'])
|
||||
self.assertFalse(mock_execute.called)
|
||||
|
||||
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
|
||||
autospec=True)
|
||||
@mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True)
|
||||
def test_do_next_clean_step_oob_reboot_fail(self, tear_mock,
|
||||
mock_execute):
|
||||
# When a clean step fails with no reboot requested go to CLEANFAIL
|
||||
tgt_prov_state = states.MANAGEABLE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake-hardware',
|
||||
provision_state=states.CLEANING,
|
||||
target_provision_state=tgt_prov_state,
|
||||
last_error=None,
|
||||
driver_internal_info={'clean_steps': self.clean_steps,
|
||||
'clean_step_index': None},
|
||||
clean_step={})
|
||||
mock_execute.side_effect = exception.AgentConnectionFailed(
|
||||
reason='failed')
|
||||
|
||||
with task_manager.acquire(
|
||||
self.context, node.uuid, shared=False) as task:
|
||||
self.service._do_next_clean_step(task, 0)
|
||||
tear_mock.assert_called_once_with(task.driver.deploy, task)
|
||||
|
||||
self._stop_service()
|
||||
node.refresh()
|
||||
|
||||
# Make sure we go to CLEANFAIL, clear clean_steps
|
||||
self.assertEqual(states.CLEANFAIL, node.provision_state)
|
||||
self.assertEqual(tgt_prov_state, node.target_provision_state)
|
||||
self.assertEqual({}, node.clean_step)
|
||||
self.assertNotIn('clean_step_index', node.driver_internal_info)
|
||||
self.assertNotIn('skip_current_clean_step', node.driver_internal_info)
|
||||
self.assertIsNotNone(node.last_error)
|
||||
self.assertTrue(node.maintenance)
|
||||
mock_execute.assert_called_once_with(
|
||||
mock.ANY, mock.ANY, self.clean_steps[0])
|
||||
|
||||
@mock.patch.object(manager, 'LOG', autospec=True)
|
||||
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
|
||||
autospec=True)
|
||||
|
@ -133,6 +133,24 @@ class TestAgentClient(base.TestCase):
|
||||
{'method': method, 'node': self.node.uuid,
|
||||
'error': error}, str(e))
|
||||
|
||||
def test__command_fail_connect(self):
|
||||
error = 'Boom'
|
||||
self.client.session.post.side_effect = requests.ConnectionError(error)
|
||||
method = 'foo.bar'
|
||||
params = {}
|
||||
|
||||
self.client._get_command_url(self.node)
|
||||
self.client._get_command_body(method, params)
|
||||
|
||||
e = self.assertRaises(exception.AgentConnectionFailed,
|
||||
self.client._command,
|
||||
self.node, method, params)
|
||||
self.assertEqual('Connection to agent failed: Failed to invoke '
|
||||
'agent command %(method)s for node %(node)s. '
|
||||
'Error: %(error)s' %
|
||||
{'method': method, 'node': self.node.uuid,
|
||||
'error': error}, str(e))
|
||||
|
||||
def test__command_error_code(self):
|
||||
response_text = '{"faultstring": "you dun goofd"}'
|
||||
self.client.session.post.return_value = MockResponse(
|
||||
|
@ -0,0 +1,10 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
The cleaning operation may fail, if an in-band clean step were to
|
||||
execute after the completion of out-of-band clean step that
|
||||
performs reboot of the node. The failure is caused because of race
|
||||
condition where in cleaning is resumed before the Ironic Python
|
||||
Agent(IPA) is ready to execute clean steps. This has been fixed.
|
||||
For more information, see `bug 2002731
|
||||
<https://storyboard.openstack.org/#!/story/2002731>`_.
|
Loading…
x
Reference in New Issue
Block a user