Merge "nova: improve compute service registration failure handling"
This commit is contained in:
commit
1de1e0f36c
@ -487,6 +487,12 @@ nova_compute_startup_delay: 30
|
||||
# nova_cell_conductor_has_api_database to no.
|
||||
nova_cell_conductor_has_api_database: "yes"
|
||||
|
||||
# Whether the failure of a nova-compute service to register itself is fatal to
|
||||
# the Kolla Ansible run. This is evaluated on a per-cell basis. Default
|
||||
# behaviour is to only fail the host on which the compute service failed to
|
||||
# register itself.
|
||||
nova_compute_registration_fatal: false
|
||||
|
||||
####################
|
||||
# Notification
|
||||
####################
|
||||
|
@ -16,7 +16,7 @@
|
||||
- name: Flush handlers
|
||||
meta: flush_handlers
|
||||
|
||||
- include_tasks: discover_computes.yml
|
||||
- import_tasks: wait_discover_computes.yml
|
||||
vars:
|
||||
# List of virtualised compute hypervisors in this Ansible play batch.
|
||||
virt_computes_in_batch: >-
|
||||
@ -34,5 +34,4 @@
|
||||
# Run discovery when one or more compute hosts are in the Ansible batch,
|
||||
# and there is a cell conductor in the inventory to delegate to.
|
||||
- all_computes_in_batch | length > 0
|
||||
- inventory_hostname == all_computes_in_batch[0]
|
||||
- groups[nova_cell_conductor_group] | length > 0
|
||||
|
@ -1,77 +1,21 @@
|
||||
---
|
||||
# We need to wait for all expected compute services to register before running
|
||||
# cells v2 host discovery. This includes virtualised compute services and
|
||||
# ironic compute services.
|
||||
# Work with --limit by including only hosts in ansible_play_batch.
|
||||
- name: Build a list of expected compute service hosts
|
||||
vars:
|
||||
# For virt, use ansible_facts.nodename rather than inventory_hostname, since this
|
||||
# is similar to what nova uses internally as its default for the
|
||||
# [DEFAULT] host config option.
|
||||
virt_compute_service_hosts: >-
|
||||
{{ virt_computes_in_batch |
|
||||
map('extract', hostvars, ['ansible_facts', 'nodename']) |
|
||||
list }}
|
||||
# For ironic, use {{ansible_facts.hostname}}-ironic since this is what we
|
||||
# configure for [DEFAULT] host in nova.conf.
|
||||
ironic_compute_service_hosts: >-
|
||||
{{ ironic_computes_in_batch |
|
||||
map('extract', hostvars, ['ansible_facts', 'hostname']) |
|
||||
map('regex_replace', '^(.*)$', '\1-ironic') |
|
||||
list }}
|
||||
set_fact:
|
||||
expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}"
|
||||
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
||||
# Discover compute hosts for a cell.
|
||||
|
||||
- name: Waiting for nova-compute services to register themselves
|
||||
become: true
|
||||
command: >
|
||||
docker exec kolla_toolbox openstack
|
||||
--os-interface {{ openstack_interface }}
|
||||
--os-auth-url {{ openstack_auth.auth_url }}
|
||||
--os-username {{ openstack_auth.username }}
|
||||
--os-password {{ openstack_auth.password }}
|
||||
--os-identity-api-version 3
|
||||
--os-user-domain-name {{ openstack_auth.user_domain_name }}
|
||||
--os-system-scope {{ openstack_auth.system_scope }}
|
||||
--os-region-name {{ openstack_region_name }}
|
||||
{% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %}
|
||||
compute service list --format json --column Host --service nova-compute
|
||||
register: nova_compute_services
|
||||
changed_when: false
|
||||
retries: 20
|
||||
delay: 10
|
||||
until:
|
||||
- nova_compute_services is success
|
||||
# A list containing the 'Host' field of compute services that have
|
||||
# registered themselves. Don't exclude compute services that are disabled
|
||||
# since these could have been explicitly disabled by the operator. While we
|
||||
# could exclude services that are down, the nova-manage cell_v2
|
||||
# discover_hosts does not do this so let's not block on it here.
|
||||
# NOTE(mgoddard): Cannot factor this out into an intermediary variable
|
||||
# before ansible 2.8, due to
|
||||
# https://bugs.launchpad.net/kolla-ansible/+bug/1835817.
|
||||
- (nova_compute_services.stdout |
|
||||
from_json |
|
||||
map(attribute='Host') |
|
||||
list)
|
||||
is superset(expected_compute_service_hosts)
|
||||
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
||||
- block:
|
||||
- import_tasks: get_cell_settings.yml
|
||||
|
||||
- import_tasks: get_cell_settings.yml
|
||||
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
||||
- name: Fail if cell settings not found
|
||||
fail:
|
||||
msg: >-
|
||||
Unable to find settings for {{ nova_cell_name or 'the default cell' }}.
|
||||
when: not nova_cell_settings
|
||||
|
||||
- name: Fail if cell settings not found
|
||||
fail:
|
||||
msg: >-
|
||||
Unable to find settings for {{ nova_cell_name or 'the default cell' }}.
|
||||
when: not nova_cell_settings
|
||||
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
||||
# TODO(yoctozepto): no need to do --by-service if ironic not used
|
||||
- name: Discover nova hosts
|
||||
become: true
|
||||
command: >
|
||||
docker exec nova_conductor nova-manage cell_v2 discover_hosts --by-service --cell_uuid {{ nova_cell_settings.cell_uuid }}
|
||||
changed_when: False
|
||||
|
||||
# TODO(yoctozepto): no need to do --by-service if ironic not used
|
||||
- name: Discover nova hosts
|
||||
become: true
|
||||
command: >
|
||||
docker exec nova_conductor nova-manage cell_v2 discover_hosts --by-service --cell_uuid {{ nova_cell_settings.cell_uuid }}
|
||||
changed_when: False
|
||||
# Delegate to a cell conductor.
|
||||
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
||||
|
88
ansible/roles/nova-cell/tasks/wait_discover_computes.yml
Normal file
88
ansible/roles/nova-cell/tasks/wait_discover_computes.yml
Normal file
@ -0,0 +1,88 @@
|
||||
---
# We need to wait for all expected compute services to register before running
# cells v2 host discovery. This includes virtualised compute services and
# ironic compute services.
# Work with --limit by including only hosts in ansible_play_batch.

- block:
    - name: Waiting for nova-compute services to register themselves
      become: true
      command: >
        docker exec kolla_toolbox openstack
        --os-interface {{ openstack_interface }}
        --os-auth-url {{ openstack_auth.auth_url }}
        --os-username {{ openstack_auth.username }}
        --os-password {{ openstack_auth.password }}
        --os-identity-api-version 3
        --os-user-domain-name {{ openstack_auth.user_domain_name }}
        --os-system-scope {{ openstack_auth.system_scope }}
        --os-region-name {{ openstack_region_name }}
        {% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %}
        compute service list --format json --column Host --service nova-compute
      register: nova_compute_services
      changed_when: false
      # Never fail here: the separate fail task below decides which hosts to
      # fail, so that only the hosts that did not register are failed.
      failed_when: false
      retries: 20
      delay: 10
      until:
        # NOTE: Because of 'failed_when: false' the registered result is never
        # marked as failed, so an 'is success' test would always pass. Check
        # the return code explicitly so that a failed query is retried rather
        # than falling through to from_json with empty or non-JSON output.
        - nova_compute_services.rc | default(1) == 0
        # A list containing the 'Host' field of compute services that have
        # registered themselves. Don't exclude compute services that are
        # disabled since these could have been explicitly disabled by the
        # operator. While we could exclude services that are down, the
        # nova-manage cell_v2 discover_hosts does not do this so let's not
        # block on it here.
        - (nova_compute_services.stdout |
           from_json |
           map(attribute='Host') |
           list)
          is superset(expected_compute_service_hosts)
      # Execute on one compute per cell, and delegate to a cell conductor.
      when: inventory_hostname == all_computes_in_batch[0]
      delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"

    # NOTE(mgoddard): Use a separate fail task to ensure we fail only those hosts
    # that failed to register.
    - name: Fail if nova-compute service failed to register
      vars:
        # 'Host' field of all registered compute services. The query result is
        # read from the host that ran the task above. If the query never
        # produced any output, treat it as an empty service list so that the
        # expected hosts are reported as unregistered instead of raising a
        # templating error.
        nova_compute_service_hosts: >-
          {{ hostvars[all_computes_in_batch[0]].nova_compute_services.stdout |
             default('[]', true) |
             from_json |
             map(attribute='Host') |
             list }}
        # 'Host' field of failed compute services.
        failed_compute_service_hosts: >-
          {{ expected_compute_service_hosts | difference(nova_compute_service_hosts) | list }}
        # Whether any compute services failed on this host.
        any_failed_services: >-
          {{ ansible_facts.nodename in failed_compute_service_hosts or
             (ansible_facts.hostname ~ "-ironic") in failed_compute_service_hosts }}
      fail:
        msg: >-
          The Nova compute service failed to register itself on the following
          hosts: {{ failed_compute_service_hosts | join(',') }}
      # Fail this host if its own service is missing, or fail every host when
      # registration failures are configured to be fatal.
      when: >-
        any_failed_services or
        (nova_compute_registration_fatal | bool and
         failed_compute_service_hosts | length > 0)

  vars:
    # For virt, use ansible_facts.nodename rather than inventory_hostname, since this
    # is similar to what nova uses internally as its default for the
    # [DEFAULT] host config option.
    virt_compute_service_hosts: >-
      {{ virt_computes_in_batch |
         map('extract', hostvars, ['ansible_facts', 'nodename']) |
         list }}
    # For ironic, use {{ansible_facts.hostname}}-ironic since this is what we
    # configure for [DEFAULT] host in nova.conf.
    ironic_compute_service_hosts: >-
      {{ ironic_computes_in_batch |
         map('extract', hostvars, ['ansible_facts', 'hostname']) |
         map('regex_replace', '^(.*)$', '\1-ironic') |
         list }}
    expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}"

- name: Include discover_computes.yml
  include_tasks: discover_computes.yml
  # Execute on one compute host per cell.
  when: inventory_hostname == all_computes_in_batch[0]
|
@ -65,3 +65,17 @@ concept known as Vendordata. If a Vendordata file is located in the
|
||||
following path within the Kolla configuration, Kolla will
|
||||
automatically use it when the Nova service is deployed or
|
||||
reconfigured: ``/etc/kolla/config/nova/vendordata.json``.
|
||||
|
||||
Failure handling
|
||||
================
|
||||
|
||||
Compute service registration
|
||||
----------------------------
|
||||
|
||||
During deployment, Kolla Ansible waits for Nova compute services to register
|
||||
themselves. By default, if a compute service does not register itself before
|
||||
the timeout, that host will be marked as failed in the Ansible run. This
|
||||
behaviour is useful at scale, where failures are more frequent.
|
||||
|
||||
Alternatively, to fail all hosts in a cell when any compute service fails
|
||||
to register, set ``nova_compute_registration_fatal`` to ``true``.
|
||||
|
@ -0,0 +1,9 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes an issue where a failure of any Nova compute service to register
|
||||
itself would cause only the host querying the Nova API to fail.
|
||||
Now, only hosts that fail to register will fail the Kolla Ansible run.
|
||||
Alternatively, to fail all hosts in a cell when any compute service fails
|
||||
to register, set ``nova_compute_registration_fatal`` to ``true``.
|
||||
`LP#1940119 <https://bugs.launchpad.net/kolla-ansible/+bug/1940119>`__
|
Loading…
x
Reference in New Issue
Block a user