Merge "Rework sw-deploy-host logic and error messaging"

This commit is contained in:
Zuul 2024-08-19 18:53:25 +00:00 committed by Gerrit Code Review
commit 719621fe6e
10 changed files with 236 additions and 101 deletions

View File

@ -2367,8 +2367,9 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
elif not error_msg:
error_msg = f"Caught exception while trying to query deployment info, error={e}"
response["error-message"] = error_msg
DLOG.exception(error_msg)
if error_msg:
response["error-message"] = error_msg.strip()
DLOG.exception(error_msg)
except Exception as e:
error_msg = f"Caught exception while trying to query deployment info, error={e}"
@ -2429,12 +2430,15 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
"Unknown error while trying software deploy precheck, "
"check /var/log/nfv-vim.log or /var/log/software.log for more information."
)
else:
error_msg = f"Software deploy precheck was rejected: {error_msg}"
elif not error_msg:
error_msg = f"Caught exception while trying software deploy precheck, error={e}"
response["error-message"] = error_msg.strip()
DLOG.exception(error_msg)
if error_msg:
response["error-message"] = error_msg.strip()
DLOG.exception(error_msg)
except Exception as e:
error_msg = f"Caught exception while trying software deploy precheck, error={e}"
@ -2504,12 +2508,15 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
"Unknown error while trying software deploy start, "
"check /var/log/nfv-vim.log or /var/log/software.log for more information."
)
else:
error_msg = f"Software deploy start was rejected: {error_msg}"
elif not error_msg:
error_msg = f"Caught exception while trying software deploy start, error={e}"
response["error-message"] = error_msg.strip()
DLOG.exception(error_msg)
if error_msg:
response["error-message"] = error_msg.strip()
DLOG.exception(error_msg)
except Exception as e:
error_msg = f"Caught exception while trying software deploy start, error={e}"
@ -2581,12 +2588,15 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
"Unknown error while trying software deploy activate, "
"check /var/log/nfv-vim.log or /var/log/software.log for more information."
)
else:
error_msg = f"Software deploy activate was rejected: {error_msg}"
elif not error_msg:
error_msg = f"Caught exception while trying software deploy activate, error={e}"
response["error-message"] = error_msg.strip()
DLOG.exception(error_msg)
if error_msg:
response["error-message"] = error_msg.strip()
DLOG.exception(error_msg)
except Exception as e:
error_msg = f"Caught exception while trying software deploy activate, error={e}"
@ -2657,12 +2667,15 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
"Unknown error while trying software deploy complete, "
"check /var/log/nfv-vim.log or /var/log/software.log for more information."
)
else:
error_msg = f"Software deploy complete was rejected: {error_msg}"
elif not error_msg:
error_msg = f"Caught exception while trying software deploy complete, error={e}"
response["error-message"] = error_msg.strip()
DLOG.exception(error_msg)
if error_msg:
response["error-message"] = error_msg.strip()
DLOG.exception(error_msg)
except Exception as e:
error_msg = f"Caught exception while trying software deploy complete, error={e}"
@ -3841,22 +3854,23 @@ class NFVIInfrastructureAPI(nfvi.api.v1.NFVIInfrastructureAPI):
except exceptions.OpenStackRestAPIException as e:
x = json.loads(e.http_response_body)
msg = x.get("error", x.get("info"))
response["error-message"] = msg.strip()
error_msg = x.get("error", x.get("info"))
if httplib.UNAUTHORIZED == e.http_status_code:
response['error-code'] = nfvi.NFVI_ERROR_CODE.TOKEN_EXPIRED
if self._platform_token is not None:
self._platform_token.set_expired()
else:
DLOG.exception("Caught exception while trying to upgrade "
"a host %s, error=%s." % (host_name, e))
response['reason'] = e.http_response_reason
response["error-message"] = msg.strip()
elif not error_msg:
error_msg = f"Caught exception while trying software deploy host {host_name}, error={e}"
if error_msg:
response["error-message"] = error_msg.strip()
DLOG.exception(error_msg)
except Exception as e:
DLOG.exception("Caught exception while trying to upgrade a "
"host %s, error=%s." % (host_name, e))
error_msg = f"Caught exception while trying software deploy host {host_name}, error={e}"
response["error-message"] = error_msg
DLOG.exception(error_msg)
finally:
callback.send(response)

View File

@ -162,29 +162,45 @@ def sw_deploy_get_upgrade_obj(token, release):
release_data = sw_deploy_get_releases(token).result_data
deploy_data = sw_deploy_show(token).result_data
hosts_info_data = sw_deploy_host_list(token).result_data
error_template = "{}, check /var/log/nfv-vim.log or /var/log/software.log for more information."
# Parse responses
for rel in release_data:
if release and rel['release_id'] == release:
release_info = rel
break
elif not release and rel['state'] == usm_states.DEPLOYING:
release = rel['release_id']
release_info = rel
break
try:
for rel in release_data:
if release and rel['release_id'] == release:
release_info = rel
break
elif not release and rel['state'] == usm_states.DEPLOYING:
release = rel['release_id']
release_info = rel
break
except Exception as e:
error = "Failed to parse 'software list'"
DLOG.exception(f"{error}: {release_data}")
raise ValueError(error_template.format(error)) from e
if not release_info:
if release:
error_msg = f"Software release not found: {release}"
error = f"Software release not found: {release}"
else:
error_msg = "Software release not found"
raise EnvironmentError(error_msg)
error = "Software release not found"
raise EnvironmentError(error)
if deploy_data:
deploy_info = deploy_data[0]
try:
if deploy_data:
deploy_info = deploy_data[0]
except Exception as e:
error = "Failed to parse 'software deploy show'"
DLOG.exception(f"{error}: {deploy_data}")
raise ValueError(error_template.format(error)) from e
if hosts_info_data:
hosts_info = hosts_info_data
try:
if hosts_info_data:
hosts_info = hosts_info_data
except Exception as e:
error = "Failed to parse 'software deploy host-list'"
DLOG.exception(f"{error}: {hosts_info_data}")
raise ValueError(error_template.format(error)) from e
upgrade_obj = nfvi.objects.v1.Upgrade(
release,

View File

@ -26,6 +26,8 @@ PATCH_RELEASE_UPGRADE = "3.2.2"
MINOR_RELEASE_UPGRADE = "4.0.1"
MAJOR_RELEASE_UPGRADE = "4.0.1"
DEPLOY_START_DELAY = 120
# utility method for the formatting of unlock-hosts stage as dict
# workers default to 5 retries with 120 second delay between attempts
@ -278,7 +280,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-upgrade',
'release': release},
{'name': 'system-stabilize',
'timeout': 15},
'timeout': DEPLOY_START_DELAY},
]
}
]
@ -326,7 +328,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-upgrade',
'release': release},
{'name': 'system-stabilize',
'timeout': 15},
'timeout': DEPLOY_START_DELAY},
]
}
]
@ -373,7 +375,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-upgrade',
'release': release},
{'name': 'system-stabilize',
'timeout': 15},
'timeout': DEPLOY_START_DELAY},
]
}
]
@ -423,7 +425,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-upgrade',
'release': release},
{'name': 'system-stabilize',
'timeout': 15},
'timeout': DEPLOY_START_DELAY},
]
}
]
@ -1224,7 +1226,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'steps': [
{'name': 'query-alarms'},
{'name': 'start-upgrade', 'release': release},
{'name': 'system-stabilize', 'timeout': 15},
{'name': 'system-stabilize', 'timeout': DEPLOY_START_DELAY},
],
},
{
@ -1291,7 +1293,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'steps': [
{'name': 'query-alarms'},
{'name': 'start-upgrade', 'release': release},
{'name': 'system-stabilize', 'timeout': 15},
{'name': 'system-stabilize', 'timeout': DEPLOY_START_DELAY},
],
},
{
@ -1362,7 +1364,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'steps': [
{'name': 'query-alarms'},
{'name': 'start-upgrade', 'release': release},
{'name': 'system-stabilize', 'timeout': 15},
{'name': 'system-stabilize', 'timeout': DEPLOY_START_DELAY},
],
},
{
@ -1438,7 +1440,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'steps': [
{'name': 'query-alarms'},
{'name': 'start-upgrade', 'release': release},
{'name': 'system-stabilize', 'timeout': 15},
{'name': 'system-stabilize', 'timeout': DEPLOY_START_DELAY},
],
},
{
@ -1526,7 +1528,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'steps': [
{'name': 'query-alarms'},
{'name': 'start-upgrade', 'release': release},
{'name': 'system-stabilize', 'timeout': 15},
{'name': 'system-stabilize', 'timeout': DEPLOY_START_DELAY},
],
},
{
@ -1624,7 +1626,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
'steps': [
{'name': 'query-alarms'},
{'name': 'start-upgrade', 'release': release},
{'name': 'system-stabilize', 'timeout': 15},
{'name': 'system-stabilize', 'timeout': DEPLOY_START_DELAY},
],
},
{
@ -1746,7 +1748,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'swact-hosts',
'entity_names': ['controller-1']},
{'name': 'start-upgrade', 'release': release},
{'name': 'system-stabilize', 'timeout': 15},
{'name': 'system-stabilize', 'timeout': DEPLOY_START_DELAY},
],
},
{

View File

@ -36,6 +36,8 @@ IGNORE_ALARMS_LIST = [
"900.231",
]
DEPLOY_START_DELAY = 120
# TODO(jkraitbe): Update this when retry count is decided.
# utility method for the formatting of unlock-hosts stage as dict
@ -1432,7 +1434,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-upgrade',
'release': strategy.nfvi_upgrade.release},
{'name': 'system-stabilize',
'timeout': 15},
'timeout': DEPLOY_START_DELAY},
]
},
{'name': 'sw-upgrade-controllers',
@ -1602,7 +1604,7 @@ class TestSwUpgradeStrategy(sw_update_testcase.SwUpdateStrategyTestCase):
{'name': 'start-upgrade',
'release': strategy.nfvi_upgrade.release},
{'name': 'system-stabilize',
'timeout': 15},
'timeout': DEPLOY_START_DELAY},
]
},
{'name': 'sw-upgrade-controllers',

View File

@ -273,27 +273,37 @@ class HostDirector(object):
response = (yield)
DLOG.verbose("NFVI Upgrade Host callback response=%s." % response)
host_table = tables.tables_get_host_table()
host = host_table.get(response['host_name'], None)
if host is None:
DLOG.verbose("Host %s does not exist." % response['host_name'])
return
if self._host_operation is None:
DLOG.verbose("No host %s operation in progress." % host.name)
return
if OPERATION_TYPE.UPGRADE_HOSTS != self._host_operation.operation_type:
DLOG.verbose("Unexpected host %s operation %s, ignoring."
% (host.name, self._host_operation.operation_type))
return
if not response['completed']:
DLOG.info("Upgrade of host %s failed, reason=%s."
% (response['host_name'], response.get('error-message', response['reason'])))
result = {"host": host, "error-message": response["error-message"]}
sw_mgmt_director = directors.get_sw_mgmt_director()
sw_mgmt_director.host_upgrade_failed(result)
else:
DLOG.info("Upgrade of host %s succeeded, reason=%s."
% (response['host_name'], response['reason']))
host_table = tables.tables_get_host_table()
host = host_table.get(response['host_name'], None)
if host is None:
DLOG.verbose("Host %s does not exist." % response['host_name'])
return
if self._host_operation is None:
DLOG.verbose("No host %s operation in progress." % host.name)
return
if OPERATION_TYPE.UPGRADE_HOSTS != self._host_operation.operation_type:
DLOG.verbose("Unexpected host %s operation %s, ignoring."
% (host.name, self._host_operation.operation_type))
return
result = {"host": host}
sw_mgmt_director = directors.get_sw_mgmt_director()
sw_mgmt_director.host_upgrade_failed(host)
sw_mgmt_director.host_upgrade_changed(result)
def _nfvi_upgrade_host(self, host_uuid, host_name):
"""

View File

@ -351,13 +351,21 @@ class SwMgmtDirector(object):
self._sw_update.handle_event(
strategy.STRATEGY_EVENT.HOST_SWACT_FAILED, host)
def host_upgrade_failed(self, host):
def host_upgrade_failed(self, result):
"""
Called when an upgrade of a host failed
"""
if self._sw_update is not None:
self._sw_update.handle_event(
strategy.STRATEGY_EVENT.HOST_UPGRADE_FAILED, host)
strategy.STRATEGY_EVENT.HOST_UPGRADE_FAILED, result)
def host_upgrade_changed(self, result):
    """
    Notify the active software update that a host upgrade succeeded

    Forwards ``result`` to the software update as a HOST_UPGRADE_CHANGED
    strategy event. Does nothing when no software update is set.
    """
    sw_update = self._sw_update
    if sw_update is None:
        # Nothing is listening for strategy events, so drop the notification.
        return

    sw_update.handle_event(
        strategy.STRATEGY_EVENT.HOST_UPGRADE_CHANGED, result)
def host_fw_update_abort_failed(self, host):
"""

View File

@ -141,6 +141,13 @@ class Upgrade(ObjectData):
def is_deploy_completed(self):
return self.deploy_state == usm_states.DEPLOY_STATES.COMPLETED.value
@property
def host_states(self):
    """
    Map each host name to its software deploy host state

    Builds a dict keyed by the "hostname" of every entry in ``hosts_info``
    whose value is that entry's "host_state".

    Returns an empty dict when ``hosts_info`` is unset (None) or empty, so
    callers can do membership tests without first checking for host info.
    """
    # hosts_info may be falsy (is_host_deployed guards for that as well);
    # iterate an empty tuple in that case instead of raising TypeError.
    return {
        v["hostname"]: v["host_state"]
        for v in (self.hosts_info or ())
    }
def is_host_deployed(self, hostname):
if not self.hosts_info:
return None

View File

@ -47,6 +47,8 @@ STRATEGY_NAME = StrategyNames()
MTCE_DELAY = 15
# a no-reboot patch can stabilize in 30 seconds
NO_REBOOT_DELAY = 30
# How long to wait (in seconds) after deploy-start-done
DEPLOY_START_DONE_DELAY = 120
# constants used by the patching API for state and repo state
PATCH_REPO_STATE_APPLIED = 'Applied'
@ -1902,7 +1904,8 @@ class SwUpgradeStrategy(
# sw-deploy start for major releases must be done on controller-0
self._swact_fix(stage, HOST_NAME.CONTROLLER_1)
stage.add_step(strategy.UpgradeStartStep(release=self._release))
stage.add_step(strategy.SystemStabilizeStep(timeout_in_secs=MTCE_DELAY))
# There can be alarms related to CPU/memory/disk usage after start
stage.add_step(strategy.SystemStabilizeStep(DEPLOY_START_DONE_DELAY))
self.apply_phase.add_stage(stage)
def _add_upgrade_hosts_stages(self):
@ -1918,7 +1921,7 @@ class SwUpgradeStrategy(
for host in host_table.values():
if self.nfvi_upgrade.is_host_deployed(host.name):
DLOG.info("Skipping deploy-host for already deployed host: {host.name}")
DLOG.info(f"Skipping deploy-host for already deployed host: {host.name}")
continue
if HOST_PERSONALITY.CONTROLLER in host.personality:

View File

@ -18,6 +18,7 @@ class EventNames(object):
HOST_UNLOCK_FAILED = Constant('host-unlock-failed')
HOST_REBOOT_FAILED = Constant('host-reboot-failed')
HOST_UPGRADE_FAILED = Constant('host-upgrade-failed')
HOST_UPGRADE_CHANGED = Constant('host-upgrade-changed')
HOST_FW_UPDATE_FAILED = Constant('host-fw-update-failed')
HOST_FW_UPDATE_ABORT_FAILED = Constant('host-fw-update-abort-failed')
HOST_SWACT_FAILED = Constant('host-swact-failed')

View File

@ -3,6 +3,7 @@
#
# SPDX-License-Identifier: Apache-2.0
#
import json
import six
from nfv_common import debug
@ -16,6 +17,7 @@ from nfv_vim import objects
from nfv_vim.strategy._strategy_defs import FW_UPDATE_LABEL
from nfv_vim.strategy._strategy_defs import STRATEGY_EVENT
from nfv_vim import tables
import software.states as usm_states
DLOG = debug.debug_get_logger('nfv_vim.strategy.step')
@ -24,11 +26,6 @@ KUBE_CERT_UPDATE_TRUSTBOTHCAS = "trust-both-cas"
KUBE_CERT_UPDATE_TRUSTNEWCA = "trust-new-ca"
KUBE_CERT_UPDATE_UPDATECERTS = "update-certs"
# sw-deploy strategy constants
SW_DEPLOY_START = 'start-done'
SW_HOST_DEPLOYED = 'deployed'
SW_DEPLOY_ACTIVATE_DONE = 'activate-done'
@six.add_metaclass(Singleton)
class StrategyStepNames(Constants):
@ -1008,38 +1005,84 @@ class UpgradeHostsStep(strategy.StrategyStep):
for host in hosts:
self._host_names.append(host.name)
self._query_inprogress = False
self._step_complete = False
self._deployed_hosts = {}
self._failed_hosts = {}
self._unknown_hosts = 0
def _get_upgrade_callback_inner(self, response):
"""
Evaluate a query-upgrade response and decide whether the step is done

Stores response['result-data'] as the strategy's nfvi_upgrade, adds a
'hosts-states' entry to the response, and, once no tracked host is still
outstanding, completes the step as SUCCESS or FAILED.

Returns True when stage.step_complete() was called (pass or fail).
Returns False when the query did not complete or hosts are still
outstanding, so the caller keeps waiting.
"""
# A query that did not complete carries no usable data; keep waiting.
if not response['completed']:
return False
self.strategy.nfvi_upgrade = response['result-data']
hosts_states = self.strategy.nfvi_upgrade.host_states
# This information is already in the response, but this adds an easy view.
response['hosts-states'] = hosts_states
# Hosts recorded as deployed or failed count as finished; any other host
# of this step is still outstanding.
completed_hosts = self._deployed_hosts.keys() | self._failed_hosts.keys()
missing_hosts = set(self._host_names) - completed_hosts
# The outstanding count is reduced by self._unknown_hosts before deciding
# whether to keep waiting.
if len(missing_hosts) - self._unknown_hosts > 0:
# TODO(jkraitbe): Allow reason to be updated during STRATEGY_STEP_RESULT.WAIT
reason = f"Deploy hosts still in progress, waiting for: {missing_hosts}"
DLOG.error(reason)
return False
# Determine if any hosts failed and why
failed_hosts = {}
for v in self._host_names:
# Hosts already recorded as deployed are not re-checked.
if v in self._deployed_hosts:
continue
fail_reason = None
# A reason already recorded in self._failed_hosts takes precedence over
# the generic text for the detected state. The state checks use `in`
# (containment) rather than equality.
# NOTE(review): presumably host states are strings, making these
# substring matches - confirm against usm_states.
if v not in hosts_states:
fail_reason = self._failed_hosts.get(v, "Missing host from software deploy host-list")
elif usm_states.DEPLOY_HOST_STATES.PENDING.value in hosts_states[v]:
fail_reason = self._failed_hosts.get(v, "Host was detected in pending state")
elif usm_states.DEPLOY_HOST_STATES.FAILED.value in hosts_states[v]:
fail_reason = self._failed_hosts.get(v, "Host was detected in failed state")
elif usm_states.DEPLOY_HOST_STATES.DEPLOYING.value in hosts_states[v]:
fail_reason = self._failed_hosts.get(v, "Host was still deploying when it was expected to be done")
elif usm_states.DEPLOY_HOST_STATES.DEPLOYED.value not in hosts_states[v]:
fail_reason = self._failed_hosts.get(v, f"Host was detected in invalid state: {hosts_states[v]}")
if fail_reason:
failed_hosts[v] = fail_reason
DLOG.error(f"{v}: {fail_reason}")
# The early return above means no tracked host is still outstanding at
# this point, so a pass/fail verdict can be declared.
if failed_hosts:
response['failed-hosts'] = failed_hosts
reason = f"Deploy hosts failed for some hosts: {json.dumps(failed_hosts, indent=2)}"
result = strategy.STRATEGY_STEP_RESULT.FAILED
detailed_reason = str(response)
# The full response is handed to the phase before the step is completed.
self.phase.result_complete_response(detailed_reason)
self.stage.step_complete(result, reason)
return True
reason = "Deploy hosts succeeded for all hosts"
result = strategy.STRATEGY_STEP_RESULT.SUCCESS
DLOG.info(reason)
self.stage.step_complete(result, reason)
return True
@coroutine
def _get_upgrade_callback(self):
"""
Get Upgrade Callback
"""
response = (yield)
DLOG.debug("Query-Upgrade callback response=%s." % response)
self._query_inprogress = False
if response['completed']:
self.strategy.nfvi_upgrade = response['result-data']
host_count = 0
match_count = 0
host_info_list = self.strategy.nfvi_upgrade['hosts_info']
for host_name in self._host_names:
for host in host_info_list:
if (host_name == host['hostname']) and (host['host_state'] == SW_HOST_DEPLOYED):
match_count += 1
host_count += 1
if match_count == len(self._host_names):
result = strategy.STRATEGY_STEP_RESULT.SUCCESS
DLOG.info("Upgrade Hosts completed")
self.stage.step_complete(result, "")
else:
# keep waiting for Upgrade host state to change
pass
else:
result = strategy.STRATEGY_STEP_RESULT.FAILED
DLOG.info("Host Upgrade failed")
detailed_reason = str(response)
self.phase.result_complete_response(detailed_reason)
self.stage.step_complete(result, response['reason'])
try:
self._step_complete = self._get_upgrade_callback_inner(response)
finally:
self._query_inprogress = False
def apply(self):
"""
@ -1066,14 +1109,36 @@ class UpgradeHostsStep(strategy.StrategyStep):
DLOG.debug("Step (%s) handle event (%s)." % (self._name, event))
update = False
if event == STRATEGY_EVENT.HOST_UPGRADE_FAILED:
host = event_data
if host is not None and host.name in self._host_names:
result = strategy.STRATEGY_STEP_RESULT.FAILED
self.stage.step_complete(result, "host upgrade failed")
return True
host = event_data["host"]
if host and host.name in self._host_names:
error = f"Failed software deploy host {host.name}: {event_data['error-message']}"
self._failed_hosts[host.name] = error
DLOG.error(error)
else:
DLOG.error(f"Unknown software deploy host failed: {event_data}")
self._unknown_hosts += 1
update = True
elif event == STRATEGY_EVENT.HOST_UPGRADE_CHANGED:
host = event_data["host"]
if host and host.name in self._host_names:
self._deployed_hosts[host.name] = None
DLOG.info(f"Completed software deploy host {host.name}")
else:
DLOG.error(f"Unknown software deploy host completed: {event_data}")
self._unknown_hosts += 1
update = True
elif event in [STRATEGY_EVENT.HOST_AUDIT]:
update = True
if self._query_inprogress or self._step_complete:
return True
if update:
self._query_inprogress = True
release = self.strategy.nfvi_upgrade['release']
nfvi.nfvi_get_upgrade(release, self._get_upgrade_callback())
@ -1090,6 +1155,10 @@ class UpgradeHostsStep(strategy.StrategyStep):
self._host_uuids = list()
self._host_names = data['entity_names']
self._query_inprogress = False
self._step_complete = False
self._failed_hosts = data["failed_hosts"]
self._deployed_hosts = data["deployed_hosts"]
self._unknown_hosts = data["unknown_hosts"]
return self
def as_dict(self):
@ -1100,6 +1169,9 @@ class UpgradeHostsStep(strategy.StrategyStep):
data['entity_type'] = 'hosts'
data['entity_names'] = self._host_names
data['entity_uuids'] = self._host_uuids
data['failed_hosts'] = self._failed_hosts
data['deployed_hosts'] = self._deployed_hosts
data['unknown_hosts'] = self._unknown_hosts
return data