Improved sw-manager logs and output

Made several changes:
* Added host names to `sw-manager *-strategy show` output
* Improved logging for alarms
* Improved how alarms are displayed in `sw-manager *-strategy show`
* Fixed a silent bug that caused duplicate alarms in DB
* Fixed a bug in ``sw-manager *-strategy show` that showed incorrect
  step on timeout
* Fixed a bug that caused misalignment in logs when PID became large
* Added unified phase_reason on apply success

TEST PLAN
PASS: AIO-SX minor upgrade RR
PASS: AIO-SX minor upgrade NRR
PASS: AIO-DX minor upgrade RR

Story: 2011045
Bug: 51566
Change-Id: Ic9c989c191892f16891c3ca5303d9780a1afdb2c
Signed-off-by: Joshua Kraitberg <joshua.kraitberg@windriver.com>
This commit is contained in:
Joshua Kraitberg 2024-12-30 12:41:27 -05:00
parent a8d6da428e
commit 2ef5f9290d
5 changed files with 103 additions and 58 deletions

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2016-2024 Wind River Systems, Inc.
# Copyright (c) 2016-2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -104,7 +104,10 @@ def _get_current_stage_and_step(strategy):
if stage.inprogress or stage == current_phase.stages[-1]:
for step in current_stage.steps:
current_step = step
if step.result not in ['initial', 'success'] or step == current_stage.steps[-1]:
if (
step.result not in ['initial', 'timed-out', 'success'] or
step == current_stage.steps[-1]
):
break
break
@ -205,6 +208,8 @@ def _display_strategy(strategy, details=False, active=False, error_details=False
_print(2, "current-stage", current_stage.stage_name)
if current_step:
_print(2, "current-step", current_step.step_name)
if current_step.entity_type == "hosts" and current_step.entity_names:
_print(2, "entity-names", current_step.entity_names)
_print(2, "current-phase-completion",
("%s%%" % strategy.current_phase_completion_percentage))
_print(2, "state", strategy.state)

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2016 Wind River Systems, Inc.
# Copyright (c) 2015-2016,2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -8,6 +8,7 @@ import functools
import inspect
import logging
import os
from pathlib import Path
import six
import sys
@ -40,10 +41,17 @@ class DebugLogHandler(logging.Handler):
self.process_name = None
self.thread_name = None
pid_size = 7
try:
max_pid = Path("/proc/sys/kernel/pid_max").read_text()
pid_size = len(str(int(max_pid)))
except Exception:
pass
# To keep syslog-ng happy, we need to add the who field twice. Newer
# syslog-ng removes the header formatting
fmt = ("%(asctime)s %(who)36s[%(process)d]: %(who)36s[%(process)d] "
"%(levelname)8s %(message)s")
fmt = (f"%(asctime)s %(who)36s[%(process)d]: %(who)36s[%(process){pid_size}d] "
"%(levelname)8s %(message)s")
formatter = DebugLogFormatter(fmt)
self.setFormatter(formatter)

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2024 Wind River Systems, Inc.
# Copyright (c) 2015-2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -4263,6 +4263,7 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
alarm_data['reason_text'], alarm_data['timestamp'],
alarm_data['mgmt_affecting'])
alarms.append(alarm)
alarms.sort(key=lambda x: x.alarm_id)
response['result-data'] = alarms
response['completed'] = True

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2024 Wind River Systems, Inc.
# Copyright (c) 2015-2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -2356,6 +2356,20 @@ class SwUpgradeStrategy(
return self._build_complete_normal(result, result_reason)
def apply_complete(self, result, result_reason):
"""
Strategy Apply Complete
"""
# On success we change the reason
if result == strategy.STRATEGY_RESULT.SUCCESS:
if self._rollback:
result_reason = f"Rollback to release={self._release} was successful"
else:
result_reason = f"Upgrade to release={self._release} was successful"
super(SwUpgradeStrategy, self).apply_complete(result, result_reason)
def from_dict(self, data, build_phase=None, apply_phase=None, abort_phase=None):
"""
Initializes a software upgrade strategy object using the given dictionary

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2024 Wind River Systems, Inc.
# Copyright (c) 2015-2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -2531,35 +2531,39 @@ class QueryAlarmsStep(strategy.StrategyStep):
if self.strategy is not None:
nfvi_alarms = self.strategy.nfvi_alarms
for nfvi_alarm in response['result-data']:
if (self.strategy._alarm_restrictions ==
if (nfvi_alarm.alarm_id in self._ignore_alarms or
nfvi_alarm.alarm_id in self._ignore_alarms_conditional):
DLOG.info("Strategy ignores alarm: id=%s, uuid=%s" %
(nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
elif (self.strategy._alarm_restrictions ==
strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED and
nfvi_alarm.mgmt_affecting == 'False'):
DLOG.warn("Ignoring non-management affecting alarm "
"%s - uuid %s due to relaxed alarm "
"strictness" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
DLOG.warn("Relaxed mode ignores alarm: "
"id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
elif (self.strategy._alarm_restrictions ==
strategy.STRATEGY_ALARM_RESTRICTION_TYPES.PERMISSIVE):
DLOG.warn("Ignoring alarm "
"%s - uuid %s due to permissive alarm "
"strictness" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
elif (nfvi_alarm.alarm_id not in self._ignore_alarms and
nfvi_alarm.alarm_id not in self._ignore_alarms_conditional):
DLOG.warn("Alarm: %s" % nfvi_alarm.alarm_id)
nfvi_alarms.append(nfvi_alarm)
DLOG.warn("Permissive mode ignores alarm: "
"id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
else:
DLOG.warn("Ignoring alarm %s - uuid %s" %
(nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
DLOG.warn("Detected alarm: %s" % nfvi_alarm.alarm_id)
nfvi_alarms.append(nfvi_alarm)
self.strategy.nfvi_alarms = nfvi_alarms
if self._fail_on_alarms and self.strategy.nfvi_alarms:
result = strategy.STRATEGY_STEP_RESULT.FAILED
alarm_ids = [str(alarm.get('alarm_id')) for alarm in self.strategy.nfvi_alarms]
reason = "alarms %s from %s are present" % (alarm_ids, fm_service)
if self.strategy.nfvi_alarms:
result = (
strategy.STRATEGY_STEP_RESULT.FAILED
if self._fail_on_alarms
else strategy.STRATEGY_STEP_RESULT.SUCCESS
)
reason = (
f"Unignored {fm_service} alarms are present: "
f"{json.dumps([v.as_dict() for v in self.strategy.nfvi_alarms], indent=2)}"
)
else:
result = strategy.STRATEGY_STEP_RESULT.SUCCESS
reason = ""
reason = "No unignored alarms present"
self.stage.step_complete(result, reason)
else:
@ -2631,24 +2635,22 @@ class WaitDataSyncStep(strategy.StrategyStep):
if self.strategy is not None:
nfvi_alarms = list()
for nfvi_alarm in response['result-data']:
if (self.strategy._alarm_restrictions ==
if nfvi_alarm.alarm_id in self._ignore_alarms:
DLOG.info("Strategy ignores alarm: id=%s, uuid=%s" %
(nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
elif (self.strategy._alarm_restrictions ==
strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED and
nfvi_alarm.mgmt_affecting == 'False'):
DLOG.warn("Ignoring non-management affecting alarm "
"%s - uuid %s due to relaxed alarm "
"strictness" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
DLOG.warn("Relaxed mode ignores alarm: "
"id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
elif (self.strategy._alarm_restrictions ==
strategy.STRATEGY_ALARM_RESTRICTION_TYPES.PERMISSIVE):
DLOG.warn("Ignoring alarm "
"%s - uuid %s due to permissive alarm "
"strictness" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
elif nfvi_alarm.alarm_id not in self._ignore_alarms:
nfvi_alarms.append(nfvi_alarm)
DLOG.warn("Permissive mode ignores alarm: "
"id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
else:
DLOG.debug("Ignoring alarm %s - uuid %s" %
(nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
nfvi_alarms.append(nfvi_alarm)
self.strategy.nfvi_alarms = nfvi_alarms
if self.strategy.nfvi_alarms:
@ -2749,20 +2751,21 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
if self.strategy is not None:
nfvi_alarms = list()
for nfvi_alarm in response['result-data']:
if (self.strategy._alarm_restrictions ==
if nfvi_alarm.alarm_id in self._ignore_alarms:
DLOG.info("Strategy ignores alarm: id=%s, uuid=%s" %
(nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
elif (self.strategy._alarm_restrictions ==
strategy.STRATEGY_ALARM_RESTRICTION_TYPES.RELAXED and
nfvi_alarm.mgmt_affecting == 'False'):
DLOG.warn("Ignoring non-management affecting alarm "
"%s - uuid %s due to relaxed alarm "
"strictness" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
DLOG.warn("Relaxed mode ignores alarm: "
"id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
elif (self.strategy._alarm_restrictions ==
strategy.STRATEGY_ALARM_RESTRICTION_TYPES.PERMISSIVE):
DLOG.warn("Ignoring alarm "
"%s - uuid %s due to permissive alarm "
"strictness" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
elif nfvi_alarm.alarm_id not in self._ignore_alarms:
DLOG.warn("Permissive mode ignores alarm: "
"id=%s, uuid=%s" % (nfvi_alarm.alarm_id,
nfvi_alarm.alarm_uuid))
else:
# For ignoring stale alarm(currently 750.006)
if nfvi_alarm.alarm_id in self._ignore_alarms_conditional:
format_string = "%Y-%m-%dT%H:%M:%S.%f"
@ -2784,10 +2787,6 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
else:
nfvi_alarms.append(nfvi_alarm)
nfvi_alarms.append(nfvi_alarm)
else:
DLOG.debug("Ignoring alarm %s - uuid %s" %
(nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid))
self.strategy.nfvi_alarms = nfvi_alarms
if self.strategy.nfvi_alarms:
@ -2798,9 +2797,15 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
# Removes only the alarm which has
# not yet reached specified timeout.
self.strategy.nfvi_alarms.remove(alarm)
# Keep waiting for alarms to clear
pass
else:
for v in self.strategy.nfvi_alarms:
DLOG.info(
"Waiting for unignored alarm to clear "
f"id={v.alarm_id}, uuid={v.alarm_uuid}"
)
# Do not use elif, nfvi_alarms can be modified during previous block
if not self.strategy.nfvi_alarms:
# Alarms have all cleared
result = strategy.STRATEGY_STEP_RESULT.SUCCESS
self.stage.step_complete(result, "")
@ -2864,6 +2869,18 @@ class WaitAlarmsClearStep(strategy.StrategyStep):
data['ignore_alarms_conditional'] = self._ignore_alarms_conditional
return data
def timeout(self):
"""
Strategy Step Timeout Override
"""
result, _ = super(WaitAlarmsClearStep, self).timeout()
reason = (
"Unignored alarms did not clear before timeout: "
f"{json.dumps([v.as_dict() for v in self.strategy.nfvi_alarms], indent=2)}"
)
return result, reason
class QuerySwPatchesStep(strategy.StrategyStep):
"""