Merge "Fix agent from being locked out with complex steps"

This commit is contained in:
Zuul 2025-01-30 18:39:18 +00:00 committed by Gerrit Code Review
commit 2c3a869388
10 changed files with 53 additions and 5 deletions

View File

@ -106,7 +106,8 @@ def set_node_flags(node, reboot=None, skip_current_step=None, polling=None,
:param polling: Boolean value to set for node's driver_internal_info flag
deployment_polling, servicing_polling or cleaning_polling. If it is
None, the corresponding polling flag is not set in the node's
driver_internal_info.
driver_internal_info. A polling flag is otherwise deleted from
the node's driver_internal_info.
:param step_type: The type of steps to process: 'clean', 'service'
or 'deploy'. If None, detected from the node.
"""
@ -134,6 +135,9 @@ def set_node_flags(node, reboot=None, skip_current_step=None, polling=None,
node.set_driver_internal_info(skip_field, skip_current_step)
if polling is not None:
node.set_driver_internal_info(polling_field, polling)
else:
# Always unset the value *if* previously set.
node.del_driver_internal_info(polling_field)
node.save()

View File

@ -175,6 +175,11 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
eocn = step.get('execute_on_child_nodes', False)
result = None
try:
if async_steps.CLEANING_POLLING in node.driver_internal_info:
# We're executing a new step, so a prior polling
# flag should be dictated by the new step, *not* a
# prior step.
node.del_driver_internal_info(async_steps.CLEANING_POLLING)
if not eocn:
LOG.info('Executing %(step)s on node %(node)s',
{'step': step, 'node': node.uuid})

View File

@ -340,6 +340,10 @@ def do_next_deploy_step(task, step_index):
child_node_execution = step.get('execute_on_child_nodes', False)
result = None
try:
if async_steps.DEPLOYMENT_POLLING in node.driver_internal_info:
# We're going to execute a new step, we should delete any
# older/out of date option.
node.del_driver_internal_info(async_steps.DEPLOYMENT_POLLING)
if not child_node_execution:
interface = getattr(task.driver, step.get('interface'))
LOG.info('Executing %(step)s on node %(node)s',

View File

@ -133,6 +133,10 @@ def do_next_service_step(task, step_index, disable_ramdisk=None):
eocn = step.get('execute_on_child_nodes', False)
result = None
try:
if async_steps.SERVICING_POLLING in node.driver_internal_info:
# We're going to execute a new step, we should delete any
# older/out of date option.
node.del_driver_internal_info(async_steps.SERVICING_POLLING)
if not eocn:
LOG.info('Executing %(step)s on node %(node)s',
{'step': step, 'node': node.uuid})

View File

@ -607,7 +607,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
target_provision_state=tgt_prov_state,
last_error=None,
driver_internal_info={'clean_steps': self.clean_steps,
'clean_step_index': 0},
'clean_step_index': 0,
'cleaning_polling': True},
clean_step=self.clean_steps[0])
mock_execute.return_value = return_state
@ -623,6 +624,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertEqual(1, node.driver_internal_info['clean_step_index'])
mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.clean_steps[1])
self.assertNotIn('cleaning_polling', node.driver_internal_info)
def test_do_next_clean_step_continue_from_last_cleaning(self):
self._do_next_clean_step_continue_from_last_cleaning(states.CLEANWAIT)

View File

@ -667,7 +667,8 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
autospec=True)
def test__do_next_deploy_step_in_deploywait(self, mock_execute):
driver_internal_info = {'deploy_step_index': None,
'deploy_steps': self.deploy_steps}
'deploy_steps': self.deploy_steps,
'deployment_polling': True}
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
@ -692,6 +693,7 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
self.assertEqual(states.ACTIVE, node.target_provision_state)
self.assertEqual(expected_first_step, node.deploy_step)
self.assertEqual(0, node.driver_internal_info['deploy_step_index'])
self.assertNotIn('deployment_polling', node.driver_internal_info)
mock_execute.assert_called_once_with(mock.ANY, task,
self.deploy_steps[0])

View File

@ -374,7 +374,8 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
target_provision_state=tgt_prov_state,
last_error=None,
driver_internal_info={'service_steps': self.service_steps,
'service_step_index': 0},
'service_step_index': 0,
'servicing_polling': True},
service_step=self.service_steps[0])
mock_execute.return_value = return_state
@ -390,6 +391,7 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
self.assertEqual(1, node.driver_internal_info['service_step_index'])
mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.service_steps[1])
self.assertNotIn('servicing_polling', node.driver_internal_info)
def test_do_next_clean_step_continue_from_last_cleaning(self):
self._do_next_clean_step_continue_from_last_cleaning(

View File

@ -3307,3 +3307,15 @@ class AsyncStepTestCase(db_base.DbTestCase):
skip_current_step=True,
polling=True)
self.assertEqual(expected, self.node.driver_internal_info)
def test_set_async_step_flags_clears_polling_if_not_set(self):
self.node.clean_step = {'step': 'create_configuration',
'interface': 'raid'}
self.node.driver_internal_info = {'agent_secret_token': 'test',
'cleaning_polling': True}
expected = {'cleaning_reboot': True,
'skip_current_clean_step': True}
self.node.save()
utils.set_async_step_flags(self.node, reboot=True,
skip_current_step=True)
self.assertEqual(expected, self.node.driver_internal_info)

View File

@ -0,0 +1,13 @@
---
fixes:
- |
Fixes an issue where operators executing complex arrangement of steps
which include out-of-band and in-band steps, for example a hardware
RAID ``create_configuration`` step followed by in-band steps inside of
the agent, would effectively get the agent stuck in a ``wait`` state in
the Cleaning, Servicing, or Deploying workflows.
This was related to the way out-of-band steps are executed and monitored.
Ironic, before starting to execute a new step, now cleans the polling
lockout flag for the respective workflow being executed to prevent the
agent from getting stuck. For more information, please see
`bug 2096938 <https://bugs.launchpad.net/ironic/+bug/2096938>`_.

View File

@ -165,7 +165,7 @@ filename = *.py,app.wsgi
exclude=.*,dist,doc,*lib/python*,*egg,build
import-order-style = pep8
application-import-names = ironic
max-complexity=19
max-complexity=20
# [H106] Don't put vim configuration in source files.
# [H203] Use assertIs(Not)None to check for None.
# [H204] Use assert(Not)Equal to check for equality.