kolla-ansible/ansible/roles/nova-cell/tasks/wait_discover_computes.yml
Mark Goddard f1d3ff11d0 nova: improve compute service registration failure handling
If any nova compute service fails to register itself, Kolla Ansible will
fail the host that queries the Nova API. This is the first compute host
in the inventory, and fails in the task:

    Waiting for nova-compute services to register themselves

Other hosts continue, often leading to further errors later on. Clearly
this is not ideal.

This change modifies the behaviour to query the compute service list
until all expected hosts are present, but does not fail the querying
host if they are not. A new task is added that executes for all hosts,
and fails only those hosts that have not registered successfully.

Alternatively, to fail all hosts in a cell when any compute service
fails to register, set nova_compute_registration_fatal to true.

Change-Id: I12c1928cf1f1fb9e28f1741e7fe4968004ea1816
Closes-Bug: #1940119
2022-03-29 11:26:44 +01:00

89 lines
4.0 KiB
YAML

---
# We need to wait for all expected compute services to register before running
# cells v2 host discovery. This includes virtualised compute services and
# ironic compute services.
# Work with --limit by including only hosts in ansible_play_batch.
- block:
    # Poll the compute service list from a single host until every expected
    # nova-compute service appears, retrying up to 20 times with a delay of
    # 10 between attempts. This task is never allowed to fail the querying
    # host (failed_when: false); hosts whose service did not register are
    # failed individually by the following task.
    - name: Waiting for nova-compute services to register themselves
      become: true
      command: >
        docker exec kolla_toolbox openstack
        --os-interface {{ openstack_interface }}
        --os-auth-url {{ openstack_auth.auth_url }}
        --os-username {{ openstack_auth.username }}
        --os-password {{ openstack_auth.password }}
        --os-identity-api-version 3
        --os-user-domain-name {{ openstack_auth.user_domain_name }}
        --os-system-scope {{ openstack_auth.system_scope }}
        --os-region-name {{ openstack_region_name }}
        {% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %}
        compute service list --format json --column Host --service nova-compute
      register: nova_compute_services
      changed_when: false
      failed_when: false
      retries: 20
      delay: 10
      until:
        # Check the command's return code directly rather than using
        # 'is success'. Because this task sets failed_when: false, the
        # registered result may already be marked as not failed when 'until'
        # is evaluated (this depends on the Ansible version), which would let
        # a failed query through to the from_json call below with empty
        # output. The return code is not affected by failed_when.
        - (nova_compute_services.rc | default(1)) == 0
        # A list containing the 'Host' field of compute services that have
        # registered themselves. Don't exclude compute services that are disabled
        # since these could have been explicitly disabled by the operator. While we
        # could exclude services that are down, the nova-manage cell_v2
        # discover_hosts does not do this so let's not block on it here.
        - (nova_compute_services.stdout |
           from_json |
           map(attribute='Host') |
           list)
          is superset(expected_compute_service_hosts)
      # Execute on one compute per cell, and delegate to a cell conductor.
      when: inventory_hostname == all_computes_in_batch[0]
      delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"

    # NOTE(mgoddard): Use a separate fail task to ensure we fail only those hosts
    # that failed to register.
    # This task runs on every host and reads the result registered by the
    # querying host (all_computes_in_batch[0]) through hostvars.
    - name: Fail if nova-compute service failed to register
      vars:
        # 'Host' field of all registered compute services. Fall back to an
        # empty JSON list when the query produced no output (for example if
        # the command never succeeded), so that the failure message below is
        # reported instead of a JSON parsing error. In that case every
        # expected host is treated as unregistered.
        nova_compute_service_hosts: >-
          {{ hostvars[all_computes_in_batch[0]].nova_compute_services.stdout |
             default('[]', true) |
             from_json |
             map(attribute='Host') |
             list }}
        # 'Host' field of failed compute services.
        failed_compute_service_hosts: >-
          {{ expected_compute_service_hosts | difference(nova_compute_service_hosts) | list }}
        # Whether any compute services failed on this host.
        any_failed_services: >-
          {{ ansible_facts.nodename in failed_compute_service_hosts or
             (ansible_facts.hostname ~ "-ironic") in failed_compute_service_hosts }}
      fail:
        msg: >-
          The Nova compute service failed to register itself on the following
          hosts: {{ failed_compute_service_hosts | join(',') }}
      # Fail this host if its own service is missing, or fail every host when
      # nova_compute_registration_fatal is set and any service is missing.
      when: >-
        any_failed_services or
        (nova_compute_registration_fatal | bool and
         failed_compute_service_hosts | length > 0)

  vars:
    # For virt, use ansible_facts.nodename rather than inventory_hostname, since this
    # is similar to what nova uses internally as its default for the
    # [DEFAULT] host config option.
    virt_compute_service_hosts: >-
      {{ virt_computes_in_batch |
         map('extract', hostvars, ['ansible_facts', 'nodename']) |
         list }}
    # For ironic, use {{ansible_facts.hostname}}-ironic since this is what we
    # configure for [DEFAULT] host in nova.conf.
    ironic_compute_service_hosts: >-
      {{ ironic_computes_in_batch |
         map('extract', hostvars, ['ansible_facts', 'hostname']) |
         map('regex_replace', '^(.*)$', '\1-ironic') |
         list }}
    # Every compute service name that must appear in the service list before
    # the wait above is considered complete.
    expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}"
# Pull in the tasks from discover_computes.yml on a single compute host per
# cell: only the first host of the current batch matches the condition.
- name: Include discover_computes.yml
  when: inventory_hostname == all_computes_in_batch[0]
  include_tasks: discover_computes.yml