Make VIM tolerant of compute service failures
When the VIM detects that the nova compute service is down on a worker host, it attempts to migrate instances off that host (by "disabling" the host). However, this isn't possible if the compute service is down. The VIM then fails the instances, which will eventually result in their evacuation (if the host goes offline) or a reboot of the instance (if the compute service recovers).

In the containers world, when the libvirt pod is restarted (e.g. when the stx-openstack application is re-applied), nova reports that the compute service is down (for a short period of time), which causes the undesirable behaviour described above.

The VIM is being updated to not disable the host in this case and instead just raise an alarm to indicate that the compute service has failed.

Change-Id: I186d8d76bbcd87405bafec47deb92ec24580640e
Closes-Bug: 1833096
Signed-off-by: Bart Wensley <barton.wensley@windriver.com>
This commit is contained in:
parent
f98b388a74
commit
a9004988dc
@ -53,8 +53,12 @@ class EnabledState(state_machine.State):
|
||||
return HOST_STATE.DISABLING
|
||||
|
||||
elif HOST_EVENT.TASK_COMPLETED == event:
|
||||
# Do not disable this host if only the compute service is disabled.
|
||||
# We will raise an alarm, but there is no way to safely move work
|
||||
# off the host if the compute service is down.
|
||||
if objects.HOST_SERVICE_STATE.ENABLED != \
|
||||
host.host_service_state_aggregate():
|
||||
host.host_service_state_aggregate(
|
||||
ignore_services=[objects.HOST_SERVICES.COMPUTE]):
|
||||
if not host.host_services_locked:
|
||||
DLOG.info("Host services are not enabled on %s. "
|
||||
"Disabling host." % host.name)
|
||||
@ -62,6 +66,7 @@ class EnabledState(state_machine.State):
|
||||
else:
|
||||
DLOG.info("Host services are not enabled on %s. "
|
||||
"Host services are locked." % host.name)
|
||||
|
||||
elif HOST_EVENT.TASK_FAILED == event:
|
||||
DLOG.info("Audit failed for %s." % host.name)
|
||||
|
||||
|
@ -185,10 +185,12 @@ class Host(ObjectData):
|
||||
"""
|
||||
return self._host_service_state[service]
|
||||
|
||||
def host_service_state_aggregate(self):
|
||||
def host_service_state_aggregate(self, ignore_services=None):
|
||||
"""
|
||||
Returns the overall state of the host services
|
||||
"""
|
||||
if ignore_services is None:
|
||||
ignore_services = []
|
||||
all_enabled = True
|
||||
at_least_one_failed = False
|
||||
for service, service_state in self._host_service_state.items():
|
||||
@ -196,6 +198,9 @@ class Host(ObjectData):
|
||||
# there is no query function for that service.
|
||||
if service == HOST_SERVICES.CONTAINER:
|
||||
continue
|
||||
# Ignore services we were told to ignore
|
||||
if service in ignore_services:
|
||||
continue
|
||||
all_enabled = all_enabled and \
|
||||
(service_state == HOST_SERVICE_STATE.ENABLED)
|
||||
at_least_one_failed = at_least_one_failed or \
|
||||
@ -758,30 +763,39 @@ class Host(ObjectData):
|
||||
|
||||
if service is not None:
|
||||
if host_service_state == self._host_service_state[service]:
|
||||
# No change to the state of the service
|
||||
return
|
||||
|
||||
self._host_service_state[service] = host_service_state
|
||||
|
||||
# Host services logs and alarms only apply to worker hosts
|
||||
if 'worker' in self.personality:
|
||||
host_service_state_overall = \
|
||||
self.host_service_state_aggregate()
|
||||
if (HOST_SERVICE_STATE.ENABLED ==
|
||||
host_service_state_overall):
|
||||
# Host services logs and alarms only apply to the compute service on
|
||||
# worker hosts
|
||||
if 'worker' in self.personality and HOST_SERVICES.COMPUTE == service:
|
||||
if HOST_SERVICE_STATE.ENABLED == host_service_state:
|
||||
self._events = event_log.host_issue_log(
|
||||
self, event_log.EVENT_ID.HOST_SERVICES_ENABLED)
|
||||
alarm.host_clear_alarm(self._alarms)
|
||||
self._alarms[:] = list()
|
||||
|
||||
elif (HOST_SERVICE_STATE.DISABLED ==
|
||||
host_service_state_overall):
|
||||
elif HOST_SERVICE_STATE.DISABLED == host_service_state:
|
||||
# Always log the disabled compute service
|
||||
self._events = event_log.host_issue_log(
|
||||
self, event_log.EVENT_ID.HOST_SERVICES_DISABLED)
|
||||
# Clear any previous alarms for this host
|
||||
alarm.host_clear_alarm(self._alarms)
|
||||
self._alarms[:] = list()
|
||||
# Alarm the disabled compute service if the host is still
|
||||
# enabled and is not being locked. Alarm it as a failure.
|
||||
if self.nfvi_host_is_enabled():
|
||||
if reason is None:
|
||||
additional_text = ''
|
||||
else:
|
||||
additional_text = ", %s" % reason
|
||||
self._alarms = alarm.host_raise_alarm(
|
||||
self, alarm.ALARM_TYPE.HOST_SERVICES_FAILED,
|
||||
additional_text=additional_text)
|
||||
|
||||
elif (HOST_SERVICE_STATE.FAILED ==
|
||||
host_service_state_overall):
|
||||
elif HOST_SERVICE_STATE.FAILED == host_service_state:
|
||||
if reason is None:
|
||||
additional_text = ''
|
||||
else:
|
||||
@ -790,6 +804,10 @@ class Host(ObjectData):
|
||||
self._events = event_log.host_issue_log(
|
||||
self, event_log.EVENT_ID.HOST_SERVICES_FAILED,
|
||||
additional_text=additional_text)
|
||||
# Clear any previous alarms for this host
|
||||
alarm.host_clear_alarm(self._alarms)
|
||||
self._alarms[:] = list()
|
||||
# Alarm the failed compute service
|
||||
self._alarms = alarm.host_raise_alarm(
|
||||
self, alarm.ALARM_TYPE.HOST_SERVICES_FAILED,
|
||||
additional_text=additional_text)
|
||||
|
Loading…
x
Reference in New Issue
Block a user