Improved sw-manager logs and output

Made several changes: * Added host names to `sw-manager *-strategy show` output * Improved logging for alarms * Improved how alarms are displayed in `sw-manager *-strategy show` * Fixed a silent bug that caused duplicate alarms in DB * Fixed a bug in ``sw-manager *-strategy show` that showed incorrect step on timeout * Fixed a bug that caused misalignment in logs when PID became large * Added unified phase_reason on apply success TEST PLAN PASS: AIO-SX minor upgrade RR PASS: AIO-SX minor upgrade NRR PASS: AIO-DX minor upgrade RR Story: 2011045 Bug: 51566 Change-Id: Ic9c989c191892f16891c3ca5303d9780a1afdb2c Signed-off-by: Joshua Kraitberg <joshua.kraitberg@windriver.com>
2024-12-30 12:41:27 -05:00 · 2024-12-30 12:41:27 -05:00 · 2ef5f9290d
commit 2ef5f9290d
parent a8d6da428e
5 changed files with 103 additions and 58 deletions
--- a/nfv/nfv-client/nfv_client/sw_update/_sw_update.py
+++ b/nfv/nfv-client/nfv_client/sw_update/_sw_update.py
@ -1,5 +1,5 @@
 #
-# Copyright (c) 2016-2024 Wind River Systems, Inc.
+# Copyright (c) 2016-2025 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@ -104,7 +104,10 @@ def _get_current_stage_and_step(strategy):
        if stage.inprogress or stage == current_phase.stages[-1]:
            for step in current_stage.steps:
                current_step = step
-                if step.result not in ['initial', 'success'] or step == current_stage.steps[-1]:
+                if (
                        step.result not in ['initial', 'timed-out', 'success'] or
                        step == current_stage.steps[-1]
                ):
                    break
            break
@ -205,6 +208,8 @@ def _display_strategy(strategy, details=False, active=False, error_details=False
        _print(2, "current-stage", current_stage.stage_name)
    if current_step:
        _print(2, "current-step", current_step.step_name)
        if current_step.entity_type == "hosts" and current_step.entity_names:
            _print(2, "entity-names", current_step.entity_names)
    _print(2, "current-phase-completion",
           ("%s%%" % strategy.current_phase_completion_percentage))
    _print(2, "state", strategy.state)
--- a/nfv/nfv-common/nfv_common/debug/_debug_log.py
+++ b/nfv/nfv-common/nfv_common/debug/_debug_log.py
@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2016 Wind River Systems, Inc.
+# Copyright (c) 2015-2016,2025 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@ -8,6 +8,7 @@ import functools
 import inspect
 import logging
 import os
 from pathlib import Path
 import six
 import sys
@ -40,10 +41,17 @@ class DebugLogHandler(logging.Handler):
        self.process_name = None
        self.thread_name = None
        pid_size = 7
        try:
            max_pid = Path("/proc/sys/kernel/pid_max").read_text()
            pid_size = len(str(int(max_pid)))
        except Exception:
            pass
        # To keep syslog-ng happy, we need to add the who field twice.  Newer
        # syslog-ng removes the header formatting
-        fmt = ("%(asctime)s %(who)36s[%(process)d]: %(who)36s[%(process)d] "
+        fmt = (f"%(asctime)s %(who)36s[%(process)d]: %(who)36s[%(process){pid_size}d] "
-               "%(levelname)8s %(message)s")
+                "%(levelname)8s %(message)s")
        formatter = DebugLogFormatter(fmt)
        self.setFormatter(formatter)
--- a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py
+++ b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py
@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2024 Wind River Systems, Inc.
+# Copyright (c) 2015-2025 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@ -4263,6 +4263,7 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
                    alarm_data['reason_text'], alarm_data['timestamp'],
                    alarm_data['mgmt_affecting'])
                alarms.append(alarm)
            alarms.sort(key=lambda x: x.alarm_id)
            response['result-data'] = alarms
            response['completed'] = True
--- a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py
+++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py
@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2024 Wind River Systems, Inc.
+# Copyright (c) 2015-2025 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@ -2356,6 +2356,20 @@ class SwUpgradeStrategy(
        return self._build_complete_normal(result, result_reason)
    def apply_complete(self, result, result_reason):
        """
        Strategy Apply Complete
        """
        # On success we change the reason
        if result == strategy.STRATEGY_RESULT.SUCCESS:
            if self._rollback:
                result_reason = f"Rollback to release={self._release} was successful"
            else:
                result_reason = f"Upgrade to release={self._release} was successful"
        super(SwUpgradeStrategy, self).apply_complete(result, result_reason)
    def from_dict(self, data, build_phase=None, apply_phase=None, abort_phase=None):
        """
        Initializes a software upgrade strategy object using the given dictionary
--- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py
+++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py
@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2024 Wind River Systems, Inc.
+# Copyright (c) 2015-2025 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@ -2531,35 +2531,39 @@ class QueryAlarmsStep(strategy.StrategyStep):
            if self.strategy is not None:
                nfvi_alarms = self.strategy.nfvi_alarms
                for nfvi_alarm in response['result-data']:
-                    if (self.strategy._alarm_restrictions ==
+                    if (nfvi_alarm.alarm_id in self._ignore_alarms or
                            nfvi_alarm.alarm_id in self._ignore_alarms_conditional):
                        DLOG.info("Strategy ignores alarm: id=%s, uuid=%s" %
                                   (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
                    elif (self.strategy._alarm_restrictions ==
                            strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED and
                            nfvi_alarm.mgmt_affecting == 'False'):
-                        DLOG.warn("Ignoring non-management affecting alarm "
+                        DLOG.warn("Relaxed mode ignores alarm: "
-                                  "%s - uuid %s due to relaxed alarm "
+                                  "id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
-                                  "strictness" % (nfvi_alarm.alarm_id,
+                                                      nfvi_alarm.alarm_uuid))
                                                  nfvi_alarm.alarm_uuid))
                    elif (self.strategy._alarm_restrictions ==
                            strategy.STRATEGY_ALARM_RESTRICTION_TYPES.PERMISSIVE):
-                        DLOG.warn("Ignoring alarm "
+                        DLOG.warn("Permissive mode ignores alarm: "
-                                  "%s - uuid %s due to permissive alarm "
+                                  "id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
-                                  "strictness" % (nfvi_alarm.alarm_id,
+                                                      nfvi_alarm.alarm_uuid))
                                                  nfvi_alarm.alarm_uuid))
                    elif (nfvi_alarm.alarm_id not in self._ignore_alarms and
                            nfvi_alarm.alarm_id not in self._ignore_alarms_conditional):
                        DLOG.warn("Alarm: %s" % nfvi_alarm.alarm_id)
                        nfvi_alarms.append(nfvi_alarm)
                    else:
-                        DLOG.warn("Ignoring alarm %s - uuid %s" %
+                        DLOG.warn("Detected alarm: %s" % nfvi_alarm.alarm_id)
-                                  (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
+                        nfvi_alarms.append(nfvi_alarm)
                self.strategy.nfvi_alarms = nfvi_alarms
-            if self._fail_on_alarms and self.strategy.nfvi_alarms:
+            if self.strategy.nfvi_alarms:
-                result = strategy.STRATEGY_STEP_RESULT.FAILED
+                result = (
-                alarm_ids = [str(alarm.get('alarm_id')) for alarm in self.strategy.nfvi_alarms]
+                    strategy.STRATEGY_STEP_RESULT.FAILED
-                reason = "alarms %s from %s are present" % (alarm_ids, fm_service)
+                    if self._fail_on_alarms
                    else strategy.STRATEGY_STEP_RESULT.SUCCESS
                )
                reason = (
                    f"Unignored {fm_service} alarms are present: "
                    f"{json.dumps([v.as_dict() for v in self.strategy.nfvi_alarms], indent=2)}"
                )
            else:
                result = strategy.STRATEGY_STEP_RESULT.SUCCESS
-                reason = ""
+                reason = "No unignored alarms present"
            self.stage.step_complete(result, reason)
        else:
@ -2631,24 +2635,22 @@ class WaitDataSyncStep(strategy.StrategyStep):
            if self.strategy is not None:
                nfvi_alarms = list()
                for nfvi_alarm in response['result-data']:
-                    if (self.strategy._alarm_restrictions ==
+                    if nfvi_alarm.alarm_id in self._ignore_alarms:
                        DLOG.info("Strategy ignores alarm: id=%s, uuid=%s" %
                                   (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
                    elif (self.strategy._alarm_restrictions ==
                            strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED and
                            nfvi_alarm.mgmt_affecting == 'False'):
-                        DLOG.warn("Ignoring non-management affecting alarm "
+                        DLOG.warn("Relaxed mode ignores alarm: "
-                                  "%s - uuid %s due to relaxed alarm "
+                                  "id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
-                                  "strictness" % (nfvi_alarm.alarm_id,
+                                                      nfvi_alarm.alarm_uuid))
                                                  nfvi_alarm.alarm_uuid))
                    elif (self.strategy._alarm_restrictions ==
                            strategy.STRATEGY_ALARM_RESTRICTION_TYPES.PERMISSIVE):
-                        DLOG.warn("Ignoring alarm "
+                        DLOG.warn("Permissive mode ignores alarm: "
-                                  "%s - uuid %s due to permissive alarm "
+                                  "id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
-                                  "strictness" % (nfvi_alarm.alarm_id,
+                                                      nfvi_alarm.alarm_uuid))
                                                  nfvi_alarm.alarm_uuid))
                    elif nfvi_alarm.alarm_id not in self._ignore_alarms:
                        nfvi_alarms.append(nfvi_alarm)
                    else:
-                        DLOG.debug("Ignoring alarm %s - uuid %s" %
+                        nfvi_alarms.append(nfvi_alarm)
                                   (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
                self.strategy.nfvi_alarms = nfvi_alarms
            if self.strategy.nfvi_alarms:
@ -2749,20 +2751,21 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
            if self.strategy is not None:
                nfvi_alarms = list()
                for nfvi_alarm in response['result-data']:
-                    if (self.strategy._alarm_restrictions ==
+                    if nfvi_alarm.alarm_id in self._ignore_alarms:
                        DLOG.info("Strategy ignores alarm: id=%s, uuid=%s" %
                                   (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
                    elif (self.strategy._alarm_restrictions ==
                            strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED and
                            nfvi_alarm.mgmt_affecting == 'False'):
-                        DLOG.warn("Ignoring non-management affecting alarm "
+                        DLOG.warn("Relaxed mode ignores alarm: "
-                                  "%s - uuid %s due to relaxed alarm "
+                                  "id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
-                                  "strictness" % (nfvi_alarm.alarm_id,
+                                                      nfvi_alarm.alarm_uuid))
                                                  nfvi_alarm.alarm_uuid))
                    elif (self.strategy._alarm_restrictions ==
                            strategy.STRATEGY_ALARM_RESTRICTION_TYPES.PERMISSIVE):
-                        DLOG.warn("Ignoring alarm "
+                        DLOG.warn("Permissive mode ignores alarm: "
-                                  "%s - uuid %s due to permissive alarm "
+                                  "id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
-                                  "strictness" % (nfvi_alarm.alarm_id,
+                                                      nfvi_alarm.alarm_uuid))
-                                                  nfvi_alarm.alarm_uuid))
+                    else:
                    elif nfvi_alarm.alarm_id not in self._ignore_alarms:
                        # For ignoring stale alarm(currently 750.006)
                        if nfvi_alarm.alarm_id in self._ignore_alarms_conditional:
                            format_string = "%Y-%m-%dT%H:%M:%S.%f"
@ -2784,10 +2787,6 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
                        else:
                            nfvi_alarms.append(nfvi_alarm)
                        nfvi_alarms.append(nfvi_alarm)
                    else:
                        DLOG.debug("Ignoring alarm %s - uuid %s" %
                                   (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
                self.strategy.nfvi_alarms = nfvi_alarms
            if self.strategy.nfvi_alarms:
@ -2798,9 +2797,15 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
                            # Removes only the alarm which has
                            # not yet reached specified timeout.
                            self.strategy.nfvi_alarms.remove(alarm)
-                # Keep waiting for alarms to clear
+
-                pass
+            for v in self.strategy.nfvi_alarms:
-            else:
+                DLOG.info(
                    "Waiting for unignored alarm to clear "
                    f"id={v.alarm_id}, uuid={v.alarm_uuid}"
                )
            # Do not use elif, nfvi_alarms can be modified during previous block
            if not self.strategy.nfvi_alarms:
                # Alarms have all cleared
                result = strategy.STRATEGY_STEP_RESULT.SUCCESS
                self.stage.step_complete(result, "")
@ -2864,6 +2869,18 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
        data['ignore_alarms_conditional'] = self._ignore_alarms_conditional
        return data
    def timeout(self):
        """
        Strategy Step Timeout Override
        """
        result, _ = super(WaitAlarmsClearStep, self).timeout()
        reason = (
            "Unignored alarms did not clear before timeout: "
            f"{json.dumps([v.as_dict() for v in self.strategy.nfvi_alarms], indent=2)}"
        )
        return result, reason
 class QuerySwPatchesStep(strategy.StrategyStep):
    """