Merge "nova: improve compute service registration failure handling"
This commit is contained in:
commit
1de1e0f36c
@ -487,6 +487,12 @@ nova_compute_startup_delay: 30
|
|||||||
# nova_cell_conductor_has_api_database to no.
|
# nova_cell_conductor_has_api_database to no.
|
||||||
nova_cell_conductor_has_api_database: "yes"
|
nova_cell_conductor_has_api_database: "yes"
|
||||||
|
|
||||||
|
# Whether the failure of a nova-compute service to register itself is fatal to
|
||||||
|
# the Kolla Ansible run. This is evaluated on a per-cell basis. Default
|
||||||
|
# behaviour is to only fail the host on which the compute service failed to
|
||||||
|
# register itself.
|
||||||
|
nova_compute_registration_fatal: false
|
||||||
|
|
||||||
####################
|
####################
|
||||||
# Notification
|
# Notification
|
||||||
####################
|
####################
|
||||||
|
@ -16,7 +16,7 @@
|
|||||||
- name: Flush handlers
|
- name: Flush handlers
|
||||||
meta: flush_handlers
|
meta: flush_handlers
|
||||||
|
|
||||||
- include_tasks: discover_computes.yml
|
- import_tasks: wait_discover_computes.yml
|
||||||
vars:
|
vars:
|
||||||
# List of virtualised compute hypervisors in this Ansible play batch.
|
# List of virtualised compute hypervisors in this Ansible play batch.
|
||||||
virt_computes_in_batch: >-
|
virt_computes_in_batch: >-
|
||||||
@ -34,5 +34,4 @@
|
|||||||
# Run discovery when one or more compute hosts are in the Ansible batch,
|
# Run discovery when one or more compute hosts are in the Ansible batch,
|
||||||
# and there is a cell conductor in the inventory to delegate to.
|
# and there is a cell conductor in the inventory to delegate to.
|
||||||
- all_computes_in_batch | length > 0
|
- all_computes_in_batch | length > 0
|
||||||
- inventory_hostname == all_computes_in_batch[0]
|
|
||||||
- groups[nova_cell_conductor_group] | length > 0
|
- groups[nova_cell_conductor_group] | length > 0
|
||||||
|
@ -1,77 +1,21 @@
|
|||||||
---
|
---
|
||||||
# We need to wait for all expected compute services to register before running
|
# Discover compute hosts for a cell.
|
||||||
# cells v2 host discovery. This includes virtualised compute services and
|
|
||||||
# ironic compute services.
|
|
||||||
# Work with --limit by including only hosts in ansible_play_batch.
|
|
||||||
- name: Build a list of expected compute service hosts
|
|
||||||
vars:
|
|
||||||
# For virt, use ansible_facts.nodename rather than inventory_hostname, since this
|
|
||||||
# is similar to what nova uses internally as its default for the
|
|
||||||
# [DEFAULT] host config option.
|
|
||||||
virt_compute_service_hosts: >-
|
|
||||||
{{ virt_computes_in_batch |
|
|
||||||
map('extract', hostvars, ['ansible_facts', 'nodename']) |
|
|
||||||
list }}
|
|
||||||
# For ironic, use {{ansible_facts.hostname}}-ironic since this is what we
|
|
||||||
# configure for [DEFAULT] host in nova.conf.
|
|
||||||
ironic_compute_service_hosts: >-
|
|
||||||
{{ ironic_computes_in_batch |
|
|
||||||
map('extract', hostvars, ['ansible_facts', 'hostname']) |
|
|
||||||
map('regex_replace', '^(.*)$', '\1-ironic') |
|
|
||||||
list }}
|
|
||||||
set_fact:
|
|
||||||
expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}"
|
|
||||||
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
|
||||||
|
|
||||||
- name: Waiting for nova-compute services to register themselves
|
- block:
|
||||||
become: true
|
- import_tasks: get_cell_settings.yml
|
||||||
command: >
|
|
||||||
docker exec kolla_toolbox openstack
|
|
||||||
--os-interface {{ openstack_interface }}
|
|
||||||
--os-auth-url {{ openstack_auth.auth_url }}
|
|
||||||
--os-username {{ openstack_auth.username }}
|
|
||||||
--os-password {{ openstack_auth.password }}
|
|
||||||
--os-identity-api-version 3
|
|
||||||
--os-user-domain-name {{ openstack_auth.user_domain_name }}
|
|
||||||
--os-system-scope {{ openstack_auth.system_scope }}
|
|
||||||
--os-region-name {{ openstack_region_name }}
|
|
||||||
{% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %}
|
|
||||||
compute service list --format json --column Host --service nova-compute
|
|
||||||
register: nova_compute_services
|
|
||||||
changed_when: false
|
|
||||||
retries: 20
|
|
||||||
delay: 10
|
|
||||||
until:
|
|
||||||
- nova_compute_services is success
|
|
||||||
# A list containing the 'Host' field of compute services that have
|
|
||||||
# registered themselves. Don't exclude compute services that are disabled
|
|
||||||
# since these could have been explicitly disabled by the operator. While we
|
|
||||||
# could exclude services that are down, the nova-manage cell_v2
|
|
||||||
# discover_hosts does not do this so let's not block on it here.
|
|
||||||
# NOTE(mgoddard): Cannot factor this out into an intermediary variable
|
|
||||||
# before ansible 2.8, due to
|
|
||||||
# https://bugs.launchpad.net/kolla-ansible/+bug/1835817.
|
|
||||||
- (nova_compute_services.stdout |
|
|
||||||
from_json |
|
|
||||||
map(attribute='Host') |
|
|
||||||
list)
|
|
||||||
is superset(expected_compute_service_hosts)
|
|
||||||
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
|
||||||
|
|
||||||
- import_tasks: get_cell_settings.yml
|
- name: Fail if cell settings not found
|
||||||
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
fail:
|
||||||
|
msg: >-
|
||||||
|
Unable to find settings for {{ nova_cell_name or 'the default cell' }}.
|
||||||
|
when: not nova_cell_settings
|
||||||
|
|
||||||
- name: Fail if cell settings not found
|
# TODO(yoctozepto): no need to do --by-service if ironic not used
|
||||||
fail:
|
- name: Discover nova hosts
|
||||||
msg: >-
|
become: true
|
||||||
Unable to find settings for {{ nova_cell_name or 'the default cell' }}.
|
command: >
|
||||||
when: not nova_cell_settings
|
docker exec nova_conductor nova-manage cell_v2 discover_hosts --by-service --cell_uuid {{ nova_cell_settings.cell_uuid }}
|
||||||
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
changed_when: False
|
||||||
|
|
||||||
# TODO(yoctozepto): no need to do --by-service if ironic not used
|
# Delegate to a cell conductor.
|
||||||
- name: Discover nova hosts
|
|
||||||
become: true
|
|
||||||
command: >
|
|
||||||
docker exec nova_conductor nova-manage cell_v2 discover_hosts --by-service --cell_uuid {{ nova_cell_settings.cell_uuid }}
|
|
||||||
changed_when: False
|
|
||||||
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
||||||
|
88
ansible/roles/nova-cell/tasks/wait_discover_computes.yml
Normal file
88
ansible/roles/nova-cell/tasks/wait_discover_computes.yml
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
---
|
||||||
|
# We need to wait for all expected compute services to register before running
|
||||||
|
# cells v2 host discovery. This includes virtualised compute services and
|
||||||
|
# ironic compute services.
|
||||||
|
# Work with --limit by including only hosts in ansible_play_batch.
|
||||||
|
|
||||||
|
- block:
|
||||||
|
- name: Waiting for nova-compute services to register themselves
|
||||||
|
become: true
|
||||||
|
command: >
|
||||||
|
docker exec kolla_toolbox openstack
|
||||||
|
--os-interface {{ openstack_interface }}
|
||||||
|
--os-auth-url {{ openstack_auth.auth_url }}
|
||||||
|
--os-username {{ openstack_auth.username }}
|
||||||
|
--os-password {{ openstack_auth.password }}
|
||||||
|
--os-identity-api-version 3
|
||||||
|
--os-user-domain-name {{ openstack_auth.user_domain_name }}
|
||||||
|
--os-system-scope {{ openstack_auth.system_scope }}
|
||||||
|
--os-region-name {{ openstack_region_name }}
|
||||||
|
{% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %}
|
||||||
|
compute service list --format json --column Host --service nova-compute
|
||||||
|
register: nova_compute_services
|
||||||
|
changed_when: false
|
||||||
|
failed_when: false
|
||||||
|
retries: 20
|
||||||
|
delay: 10
|
||||||
|
until:
|
||||||
|
- nova_compute_services is success
|
||||||
|
# A list containing the 'Host' field of compute services that have
|
||||||
|
# registered themselves. Don't exclude compute services that are disabled
|
||||||
|
# since these could have been explicitly disabled by the operator. While we
|
||||||
|
# could exclude services that are down, the nova-manage cell_v2
|
||||||
|
# discover_hosts does not do this so let's not block on it here.
|
||||||
|
- (nova_compute_services.stdout |
|
||||||
|
from_json |
|
||||||
|
map(attribute='Host') |
|
||||||
|
list)
|
||||||
|
is superset(expected_compute_service_hosts)
|
||||||
|
# Execute on one compute per cell, and delegate to a cell conductor.
|
||||||
|
when: inventory_hostname == all_computes_in_batch[0]
|
||||||
|
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
|
||||||
|
|
||||||
|
# NOTE(mgoddard): Use a separate fail task to ensure we fail only those hosts
|
||||||
|
# that failed to register.
|
||||||
|
- name: Fail if nova-compute service failed to register
|
||||||
|
vars:
|
||||||
|
# 'Host' field of all registered compute services.
|
||||||
|
nova_compute_service_hosts: >-
|
||||||
|
{{ hostvars[all_computes_in_batch[0]].nova_compute_services.stdout |
|
||||||
|
from_json |
|
||||||
|
map(attribute='Host') |
|
||||||
|
list }}
|
||||||
|
# 'Host' field of failed compute services.
|
||||||
|
failed_compute_service_hosts: >-
|
||||||
|
{{ expected_compute_service_hosts | difference(nova_compute_service_hosts) | list }}
|
||||||
|
# Whether any compute services failed on this host.
|
||||||
|
any_failed_services: >-
|
||||||
|
{{ ansible_facts.nodename in failed_compute_service_hosts or
|
||||||
|
(ansible_facts.hostname ~ "-ironic") in failed_compute_service_hosts }}
|
||||||
|
fail:
|
||||||
|
msg: >-
|
||||||
|
The Nova compute service failed to register itself on the following
|
||||||
|
hosts: {{ failed_compute_service_hosts | join(',') }}
|
||||||
|
when: >-
|
||||||
|
any_failed_services or
|
||||||
|
(nova_compute_registration_fatal | bool and
|
||||||
|
failed_compute_service_hosts | length > 0)
|
||||||
|
vars:
|
||||||
|
# For virt, use ansible_facts.nodename rather than inventory_hostname, since this
|
||||||
|
# is similar to what nova uses internally as its default for the
|
||||||
|
# [DEFAULT] host config option.
|
||||||
|
virt_compute_service_hosts: >-
|
||||||
|
{{ virt_computes_in_batch |
|
||||||
|
map('extract', hostvars, ['ansible_facts', 'nodename']) |
|
||||||
|
list }}
|
||||||
|
# For ironic, use {{ansible_facts.hostname}}-ironic since this is what we
|
||||||
|
# configure for [DEFAULT] host in nova.conf.
|
||||||
|
ironic_compute_service_hosts: >-
|
||||||
|
{{ ironic_computes_in_batch |
|
||||||
|
map('extract', hostvars, ['ansible_facts', 'hostname']) |
|
||||||
|
map('regex_replace', '^(.*)$', '\1-ironic') |
|
||||||
|
list }}
|
||||||
|
expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}"
|
||||||
|
|
||||||
|
- name: Include discover_computes.yml
|
||||||
|
include_tasks: discover_computes.yml
|
||||||
|
# Execute on one compute host per cell.
|
||||||
|
when: inventory_hostname == all_computes_in_batch[0]
|
@ -65,3 +65,17 @@ concept known as Vendordata. If a Vendordata file is located in the
|
|||||||
following path within the Kolla configuration, Kolla will
|
following path within the Kolla configuration, Kolla will
|
||||||
automatically use it when the Nova service is deployed or
|
automatically use it when the Nova service is deployed or
|
||||||
reconfigured: ``/etc/kolla/config/nova/vendordata.json``.
|
reconfigured: ``/etc/kolla/config/nova/vendordata.json``.
|
||||||
|
|
||||||
|
Failure handling
|
||||||
|
================
|
||||||
|
|
||||||
|
Compute service registration
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
During deployment, Kolla Ansible waits for Nova compute services to register
|
||||||
|
themselves. By default, if a compute service does not register itself before
|
||||||
|
the timeout, that host will be marked as failed in the Ansible run. This
|
||||||
|
behaviour is useful at scale, where failures are more frequent and one failed host should not stop the rest.
|
||||||
|
|
||||||
|
Alternatively, to fail all hosts in a cell when any compute service fails
|
||||||
|
to register, set ``nova_compute_registration_fatal`` to ``true``.
|
||||||
|
@ -0,0 +1,9 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Fixes an issue where a failure of any Nova compute service to register
|
||||||
|
itself would cause only the host querying the Nova API to fail.
|
||||||
|
Now, only hosts that fail to register will fail the Kolla Ansible run.
|
||||||
|
Alternatively, to fail all hosts in a cell when any compute service fails
|
||||||
|
to register, set ``nova_compute_registration_fatal`` to ``true``.
|
||||||
|
`LP#1940119 <https://bugs.launchpad.net/kolla-ansible/+bug/1940119>`__
|
Loading…
x
Reference in New Issue
Block a user