From ff2030ce98dbe470f8d51d308e184cc065d491e0 Mon Sep 17 00:00:00 2001
From: Dmitry Tantsur <dtantsur@protonmail.com>
Date: Wed, 8 Apr 2020 11:09:06 +0200
Subject: [PATCH] Improve the command status checks in the agent's
 process_next_step

1) When dealing with in-band deploy steps, the command list won't be
   empty on first boot, it will contain get_deploy_steps. Account for
   that when detecting the first boot situation.

2) Ignore priorities when checking which step is running. They may
   differ when an in-band step is shadowed by an out-of-band one (and
   IPA does not allow duplicated steps).

3) Make sure to return control to the conductor when agent doesn't
   have any current commands (may happen if process_next_steps is called
   on the first heartbeat, which will be the case for deploy steps).

4) Clarify log messages in presence of both in-band and OOB steps.

Story: #2006963
Change-Id: Ie98a4138e5d960d896d00945685b686a50ae928e
---
 ironic/conductor/steps.py                     |  6 +++
 ironic/drivers/modules/agent_base.py          | 52 +++++++++++--------
 .../unit/drivers/modules/test_agent_base.py   | 15 ++++++
 3 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/ironic/conductor/steps.py b/ironic/conductor/steps.py
index e67f6e3bb1..3fb539e34a 100644
--- a/ironic/conductor/steps.py
+++ b/ironic/conductor/steps.py
@@ -81,6 +81,12 @@ def _sorted_steps(steps, sort_step_key):
     return sorted(steps, key=sort_step_key, reverse=True)
 
 
+def is_equivalent(step1, step2):
+    """Compare steps, ignoring their priority."""
+    return (step1.get('interface') == step2.get('interface')
+            and step1.get('step') == step2.get('step'))
+
+
 def _get_steps(task, interfaces, get_method, enabled=False,
                sort_step_key=None):
     """Get steps for task.node.
diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py
index be7a7dd73e..6eb2312b40 100644
--- a/ironic/drivers/modules/agent_base.py
+++ b/ironic/drivers/modules/agent_base.py
@@ -29,6 +29,7 @@ from ironic.common import exception
 from ironic.common.i18n import _
 from ironic.common import image_service
 from ironic.common import states
+from ironic.common import utils
 from ironic.conductor import steps as conductor_steps
 from ironic.conductor import utils as manager_utils
 from ironic.conf import CONF
@@ -213,6 +214,20 @@ def _post_step_reboot(task, step_type):
     task.node.save()
 
 
+def _freshly_booted(commands, step_type):
+    """Check if the ramdisk has just started.
+
+    On the very first boot we fetch the available steps, hence the only command
+    agent executed will be get_XXX_steps. For later reboots the list of
+    commands will be empty.
+    """
+    return (
+        not commands
+        or (len(commands) == 1
+            and commands[0]['command_name'] == 'get_%s_steps' % step_type)
+    )
+
+
 def _get_completed_command(task, commands, step_type):
     """Returns None or a completed clean/deploy command from the agent.
 
@@ -220,8 +235,7 @@ def _get_completed_command(task, commands, step_type):
     :param commands: a set of command results from the agent, typically
                      fetched with agent_client.get_commands_status().
     """
-    if not commands:
-        return
+    assert commands, 'BUG: _get_completed_command called with no commands'
 
     last_command = commands[-1]
 
@@ -229,8 +243,8 @@ def _get_completed_command(task, commands, step_type):
         # catches race condition where execute_step is still
         # processing so the command hasn't started yet
         LOG.debug('Expected agent last command to be "execute_%(type)s_step" '
-                  'for node %(node)s, instead got "%(command)s". Waiting '
-                  'for next heartbeat.',
+                  'for node %(node)s, instead got "%(command)s". An out-of-'
+                  'band step may be running. Waiting for next heartbeat.',
                   {'node': task.node.uuid,
                    'command': last_command['command_name'],
                    'type': step_type})
@@ -246,11 +260,14 @@ def _get_completed_command(task, commands, step_type):
                    'type': step_type.capitalize()})
         return
     elif (last_command['command_status'] == 'SUCCEEDED'
-          and last_step != current_step):
+          and (not last_step
+               or not conductor_steps.is_equivalent(last_step, current_step))):
         # A previous step was running, the new command has not yet started.
-        LOG.debug('%(type)s step not yet started for node %(node)s: %(step)s',
-                  {'step': last_step, 'node': task.node.uuid,
-                   'type': step_type.capitalize()})
+        LOG.debug('%(type)s step %(step)s is not currently running for node '
+                  '%(node)s. Not yet started or an out-of-band step is in '
+                  'progress. The last finished step is %(previous)s.',
+                  {'step': current_step, 'node': task.node.uuid,
+                   'type': step_type.capitalize(), 'previous': last_step})
         return
     else:
         return last_command
@@ -807,22 +824,13 @@ class AgentDeployMixin(HeartbeatMixin):
         node = task.node
         agent_commands = self._client.get_commands_status(task.node)
 
-        if not agent_commands:
+        if _freshly_booted(agent_commands, step_type):
             field = ('cleaning_reboot' if step_type == 'clean'
                      else 'deployment_reboot')
-            if task.node.driver_internal_info.get(field):
-                # Node finished a cleaning step that requested a reboot, and
-                # this is the first heartbeat after booting. Continue cleaning.
-                info = task.node.driver_internal_info
-                info.pop(field, None)
-                task.node.driver_internal_info = info
-                task.node.save()
-                manager_utils.notify_conductor_resume_operation(task,
-                                                                step_type)
-                return
-            else:
-                # Agent has no commands whatsoever
-                return
+            utils.pop_node_nested_field(
+                task.node, 'driver_internal_info', field)
+            manager_utils.notify_conductor_resume_operation(task, step_type)
+            return
 
         current_step = (node.clean_step if step_type == 'clean'
                         else node.deploy_step)
diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py
index 30a0cb9ddd..c4f4f722c8 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_base.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base.py
@@ -1806,6 +1806,21 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
             self.deploy.continue_cleaning(task)
             self.assertFalse(notify_mock.called)
 
+    @mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
+                       autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_no_step_running(self, status_mock, notify_mock):
+        status_mock.return_value = [{
+            'command_status': 'SUCCEEDED',
+            'command_name': 'get_clean_steps',
+            'command_result': []
+        }]
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.deploy.continue_cleaning(task)
+            notify_mock.assert_called_once_with(task, 'clean')
+
     @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
     @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
                        autospec=True)