From 2ef5f9290d37f9b7347899f68f43b22fde5659b0 Mon Sep 17 00:00:00 2001 From: Joshua Kraitberg Date: Mon, 30 Dec 2024 12:41:27 -0500 Subject: [PATCH] Improved sw-manager logs and output Made several changes: * Added host names to `sw-manager *-strategy show` output * Improved logging for alarms * Improved how alarms are displayed in `sw-manager *-strategy show` * Fixed a silent bug that caused duplicate alarms in DB * Fixed a bug in ``sw-manager *-strategy show` that showed incorrect step on timeout * Fixed a bug that caused misalignment in logs when PID became large * Added unified phase_reason on apply success TEST PLAN PASS: AIO-SX minor upgrade RR PASS: AIO-SX minor upgrade NRR PASS: AIO-DX minor upgrade RR Story: 2011045 Bug: 51566 Change-Id: Ic9c989c191892f16891c3ca5303d9780a1afdb2c Signed-off-by: Joshua Kraitberg --- .../nfv_client/sw_update/_sw_update.py | 9 +- nfv/nfv-common/nfv_common/debug/_debug_log.py | 14 ++- .../nfvi_plugins/nfvi_infrastructure_api.py | 3 +- nfv/nfv-vim/nfv_vim/strategy/_strategy.py | 16 ++- .../nfv_vim/strategy/_strategy_steps.py | 119 ++++++++++-------- 5 files changed, 103 insertions(+), 58 deletions(-) diff --git a/nfv/nfv-client/nfv_client/sw_update/_sw_update.py b/nfv/nfv-client/nfv_client/sw_update/_sw_update.py index 4cd1c796..6cb415d3 100755 --- a/nfv/nfv-client/nfv_client/sw_update/_sw_update.py +++ b/nfv/nfv-client/nfv_client/sw_update/_sw_update.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2016-2024 Wind River Systems, Inc. +# Copyright (c) 2016-2025 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -104,7 +104,10 @@ def _get_current_stage_and_step(strategy): if stage.inprogress or stage == current_phase.stages[-1]: for step in current_stage.steps: current_step = step - if step.result not in ['initial', 'success'] or step == current_stage.steps[-1]: + if ( + step.result not in ['initial', 'timed-out', 'success'] or + step == current_stage.steps[-1] + ): break break @@ -205,6 +208,8 @@ def _display_strategy(strategy, details=False, active=False, error_details=False _print(2, "current-stage", current_stage.stage_name) if current_step: _print(2, "current-step", current_step.step_name) + if current_step.entity_type == "hosts" and current_step.entity_names: + _print(2, "entity-names", current_step.entity_names) _print(2, "current-phase-completion", ("%s%%" % strategy.current_phase_completion_percentage)) _print(2, "state", strategy.state) diff --git a/nfv/nfv-common/nfv_common/debug/_debug_log.py b/nfv/nfv-common/nfv_common/debug/_debug_log.py index 518ea01e..4b8e0aa3 100755 --- a/nfv/nfv-common/nfv_common/debug/_debug_log.py +++ b/nfv/nfv-common/nfv_common/debug/_debug_log.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2016 Wind River Systems, Inc. +# Copyright (c) 2015-2016,2025 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -8,6 +8,7 @@ import functools import inspect import logging import os +from pathlib import Path import six import sys @@ -40,10 +41,17 @@ class DebugLogHandler(logging.Handler): self.process_name = None self.thread_name = None + pid_size = 7 + try: + max_pid = Path("/proc/sys/kernel/pid_max").read_text() + pid_size = len(str(int(max_pid))) + except Exception: + pass + # To keep syslog-ng happy, we need to add the who field twice. Newer # syslog-ng removes the header formatting - fmt = ("%(asctime)s %(who)36s[%(process)d]: %(who)36s[%(process)d] " - "%(levelname)8s %(message)s") + fmt = (f"%(asctime)s %(who)36s[%(process)d]: %(who)36s[%(process){pid_size}d] " + "%(levelname)8s %(message)s") formatter = DebugLogFormatter(fmt) self.setFormatter(formatter) diff --git a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py index 089583a5..9d2035cb 100755 --- a/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py +++ b/nfv/nfv-plugins/nfv_plugins/nfvi_plugins/nfvi_infrastructure_api.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2024 Wind River Systems, Inc. +# Copyright (c) 2015-2025 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -4263,6 +4263,7 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI): alarm_data['reason_text'], alarm_data['timestamp'], alarm_data['mgmt_affecting']) alarms.append(alarm) + alarms.sort(key=lambda x: x.alarm_id) response['result-data'] = alarms response['completed'] = True diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py index cdd5fa92..08443f09 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2024 Wind River Systems, Inc. +# Copyright (c) 2015-2025 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -2356,6 +2356,20 @@ class SwUpgradeStrategy( return self._build_complete_normal(result, result_reason) + def apply_complete(self, result, result_reason): + """ + Strategy Apply Complete + """ + + # On success we change the reason + if result == strategy.STRATEGY_RESULT.SUCCESS: + if self._rollback: + result_reason = f"Rollback to release={self._release} was successful" + else: + result_reason = f"Upgrade to release={self._release} was successful" + + super(SwUpgradeStrategy, self).apply_complete(result, result_reason) + def from_dict(self, data, build_phase=None, apply_phase=None, abort_phase=None): """ Initializes a software upgrade strategy object using the given dictionary diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py index c71eb4b1..c8e50251 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2024 Wind River Systems, Inc. +# Copyright (c) 2015-2025 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -2531,35 +2531,39 @@ class QueryAlarmsStep(strategy.StrategyStep): if self.strategy is not None: nfvi_alarms = self.strategy.nfvi_alarms for nfvi_alarm in response['result-data']: - if (self.strategy._alarm_restrictions == + if (nfvi_alarm.alarm_id in self._ignore_alarms or + nfvi_alarm.alarm_id in self._ignore_alarms_conditional): + DLOG.info("Strategy ignores alarm: id=%s, uuid=%s" % + (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid)) + elif (self.strategy._alarm_restrictions == strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED and nfvi_alarm.mgmt_affecting == 'False'): - DLOG.warn("Ignoring non-management affecting alarm " - "%s - uuid %s due to relaxed alarm " - "strictness" % (nfvi_alarm.alarm_id, - nfvi_alarm.alarm_uuid)) + DLOG.warn("Relaxed mode ignores alarm: " + "id=%s, uuid=%s" % (nfvi_alarm.alarm_id, + nfvi_alarm.alarm_uuid)) elif (self.strategy._alarm_restrictions == strategy.STRATEGY_ALARM_RESTRICTION_TYPES.PERMISSIVE): - DLOG.warn("Ignoring alarm " - "%s - uuid %s due to permissive alarm " - "strictness" % (nfvi_alarm.alarm_id, - nfvi_alarm.alarm_uuid)) - elif (nfvi_alarm.alarm_id not in self._ignore_alarms and - nfvi_alarm.alarm_id not in self._ignore_alarms_conditional): - DLOG.warn("Alarm: %s" % nfvi_alarm.alarm_id) - nfvi_alarms.append(nfvi_alarm) + DLOG.warn("Permissive mode ignores alarm: " + "id=%s, uuid=%s" % (nfvi_alarm.alarm_id, + nfvi_alarm.alarm_uuid)) else: - DLOG.warn("Ignoring alarm %s - uuid %s" % - (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid)) + DLOG.warn("Detected alarm: %s" % nfvi_alarm.alarm_id) + nfvi_alarms.append(nfvi_alarm) self.strategy.nfvi_alarms = nfvi_alarms - if self._fail_on_alarms and self.strategy.nfvi_alarms: - result = strategy.STRATEGY_STEP_RESULT.FAILED - alarm_ids = [str(alarm.get('alarm_id')) for alarm in self.strategy.nfvi_alarms] - reason = "alarms %s from %s are present" % (alarm_ids, fm_service) + if self.strategy.nfvi_alarms: + result = ( + strategy.STRATEGY_STEP_RESULT.FAILED + if self._fail_on_alarms + else strategy.STRATEGY_STEP_RESULT.SUCCESS + ) + reason = ( + f"Unignored {fm_service} alarms are present: " + f"{json.dumps([v.as_dict() for v in self.strategy.nfvi_alarms], indent=2)}" + ) else: result = strategy.STRATEGY_STEP_RESULT.SUCCESS - reason = "" + reason = "No unignored alarms present" self.stage.step_complete(result, reason) else: @@ -2631,24 +2635,22 @@ class WaitDataSyncStep(strategy.StrategyStep): if self.strategy is not None: nfvi_alarms = list() for nfvi_alarm in response['result-data']: - if (self.strategy._alarm_restrictions == + if nfvi_alarm.alarm_id in self._ignore_alarms: + DLOG.info("Strategy ignores alarm: id=%s, uuid=%s" % + (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid)) + elif (self.strategy._alarm_restrictions == strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED and nfvi_alarm.mgmt_affecting == 'False'): - DLOG.warn("Ignoring non-management affecting alarm " - "%s - uuid %s due to relaxed alarm " - "strictness" % (nfvi_alarm.alarm_id, - nfvi_alarm.alarm_uuid)) + DLOG.warn("Relaxed mode ignores alarm: " + "id=%s, uuid=%s" % (nfvi_alarm.alarm_id, + nfvi_alarm.alarm_uuid)) elif (self.strategy._alarm_restrictions == strategy.STRATEGY_ALARM_RESTRICTION_TYPES.PERMISSIVE): - DLOG.warn("Ignoring alarm " - "%s - uuid %s due to permissive alarm " - "strictness" % (nfvi_alarm.alarm_id, - nfvi_alarm.alarm_uuid)) - elif nfvi_alarm.alarm_id not in self._ignore_alarms: - nfvi_alarms.append(nfvi_alarm) + DLOG.warn("Permissive mode ignores alarm: " + "id=%s, uuid=%s" % (nfvi_alarm.alarm_id, + nfvi_alarm.alarm_uuid)) else: - DLOG.debug("Ignoring alarm %s - uuid %s" % - (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid)) + nfvi_alarms.append(nfvi_alarm) self.strategy.nfvi_alarms = nfvi_alarms if self.strategy.nfvi_alarms: @@ -2749,20 +2751,21 @@ class WaitAlarmsClearStep(strategy.StrategyStep): if self.strategy is not None: nfvi_alarms = list() for nfvi_alarm in response['result-data']: - if (self.strategy._alarm_restrictions == + if nfvi_alarm.alarm_id in self._ignore_alarms: + DLOG.info("Strategy ignores alarm: id=%s, uuid=%s" % + (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid)) + elif (self.strategy._alarm_restrictions == strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED and nfvi_alarm.mgmt_affecting == 'False'): - DLOG.warn("Ignoring non-management affecting alarm " - "%s - uuid %s due to relaxed alarm " - "strictness" % (nfvi_alarm.alarm_id, - nfvi_alarm.alarm_uuid)) + DLOG.warn("Relaxed mode ignores alarm: " + "id=%s, uuid=%s" % (nfvi_alarm.alarm_id, + nfvi_alarm.alarm_uuid)) elif (self.strategy._alarm_restrictions == strategy.STRATEGY_ALARM_RESTRICTION_TYPES.PERMISSIVE): - DLOG.warn("Ignoring alarm " - "%s - uuid %s due to permissive alarm " - "strictness" % (nfvi_alarm.alarm_id, - nfvi_alarm.alarm_uuid)) - elif nfvi_alarm.alarm_id not in self._ignore_alarms: + DLOG.warn("Permissive mode ignores alarm: " + "id=%s, uuid=%s" % (nfvi_alarm.alarm_id, + nfvi_alarm.alarm_uuid)) + else: # For ignoring stale alarm(currently 750.006) if nfvi_alarm.alarm_id in self._ignore_alarms_conditional: format_string = "%Y-%m-%dT%H:%M:%S.%f" @@ -2784,10 +2787,6 @@ class WaitAlarmsClearStep(strategy.StrategyStep): else: nfvi_alarms.append(nfvi_alarm) - nfvi_alarms.append(nfvi_alarm) - else: - DLOG.debug("Ignoring alarm %s - uuid %s" % - (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid)) self.strategy.nfvi_alarms = nfvi_alarms if self.strategy.nfvi_alarms: @@ -2798,9 +2797,15 @@ class WaitAlarmsClearStep(strategy.StrategyStep): # Removes only the alarm which has # not yet reached specified timeout. self.strategy.nfvi_alarms.remove(alarm) - # Keep waiting for alarms to clear - pass - else: + + for v in self.strategy.nfvi_alarms: + DLOG.info( + "Waiting for unignored alarm to clear " + f"id={v.alarm_id}, uuid={v.alarm_uuid}" + ) + + # Do not use elif, nfvi_alarms can be modified during previous block + if not self.strategy.nfvi_alarms: # Alarms have all cleared result = strategy.STRATEGY_STEP_RESULT.SUCCESS self.stage.step_complete(result, "") @@ -2864,6 +2869,18 @@ class WaitAlarmsClearStep(strategy.StrategyStep): data['ignore_alarms_conditional'] = self._ignore_alarms_conditional return data + def timeout(self): + """ + Strategy Step Timeout Override + """ + + result, _ = super(WaitAlarmsClearStep, self).timeout() + reason = ( + "Unignored alarms did not clear before timeout: " + f"{json.dumps([v.as_dict() for v in self.strategy.nfvi_alarms], indent=2)}" + ) + return result, reason + class QuerySwPatchesStep(strategy.StrategyStep): """