From 192dcd1e1b9baf7f3177a694c2b1ce8bd62d9159 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Fri, 22 Mar 2019 14:59:41 +0000 Subject: [PATCH] Fix booting instances after nova-compute upgrade After upgrading from Rocky to Stein, nova-compute services fail to start new instances with the following error message: Failed to allocate the network(s), not rescheduling. Looking in the nova-compute logs, we also see this: Neutron Reported failure on event network-vif-plugged-60c05a0d-8758-44c9-81e4-754551567be5 for instance 32c493c4-d88c-4f14-98db-c7af64bf3324: NovaException: In shutdown, no new events can be scheduled During the upgrade process, we send nova containers a SIGHUP to cause them to reload their object version state. Speaking to the nova team in IRC, there is a known issue with this, caused by oslo.service performing a full shutdown in response to a SIGHUP, which breaks nova-compute. There is a patch [1] in review to address this. The workaround employed here is to restart the nova compute service. 
[1] https://review.openstack.org/#/c/641907 Change-Id: Ia4fcc558a3f62ced2d629d7a22d0bc1eb6b879f1 Closes-Bug: #1821362 --- ansible/roles/nova/defaults/main.yml | 11 +++++++ ansible/roles/nova/tasks/config.yml | 12 +------ ansible/roles/nova/tasks/reload.yml | 49 +++++++++++++++++++--------- 3 files changed, 45 insertions(+), 27 deletions(-) diff --git a/ansible/roles/nova/defaults/main.yml b/ansible/roles/nova/defaults/main.yml index 8b0152ef3d..8081b7ad45 100644 --- a/ansible/roles/nova/defaults/main.yml +++ b/ansible/roles/nova/defaults/main.yml @@ -357,6 +357,17 @@ nova_safety_upgrade: "no" nova_libvirt_port: "16509" nova_ssh_port: "8022" +nova_services_require_nova_conf: + - nova-api + - nova-compute + - nova-compute-ironic + - nova-conductor + - nova-consoleauth + - nova-novncproxy + - nova-serialproxy + - nova-scheduler + - nova-spicehtml5proxy + #################### # Notification #################### diff --git a/ansible/roles/nova/tasks/config.yml b/ansible/roles/nova/tasks/config.yml index 60cab45be8..6c0349e2dc 100644 --- a/ansible/roles/nova/tasks/config.yml +++ b/ansible/roles/nova/tasks/config.yml @@ -81,16 +81,6 @@ - name: Copying over nova.conf become: true vars: - services_require_nova_conf: - - nova-api - - nova-compute - - nova-compute-ironic - - nova-conductor - - nova-consoleauth - - nova-novncproxy - - nova-serialproxy - - nova-scheduler - - nova-spicehtml5proxy service_name: "{{ item.key }}" merge_configs: sources: @@ -105,7 +95,7 @@ when: - inventory_hostname in groups[item.value.group] - item.value.enabled | bool - - item.key in services_require_nova_conf + - item.key in nova_services_require_nova_conf with_dict: "{{ nova_services }}" notify: - "Restart {{ item.key }} container" diff --git a/ansible/roles/nova/tasks/reload.yml b/ansible/roles/nova/tasks/reload.yml index 6b37a9091d..96f57f9681 100644 --- a/ansible/roles/nova/tasks/reload.yml +++ b/ansible/roles/nova/tasks/reload.yml @@ -1,21 +1,38 @@ --- # This play calls sighup on every 
service to refresh upgrade levels -- name: Sighup nova-api - command: docker exec -t nova_api kill -1 1 - when: inventory_hostname in groups['nova-api'] -- name: Sighup nova-conductor - command: docker exec -t nova_conductor kill -1 1 - when: inventory_hostname in groups['nova-conductor'] +# NOTE(mgoddard): Currently (just prior to Stein release), sending SIGHUP to +# nova compute services leaves them in a broken state in which they cannot +# start new instances. The following error is seen in the logs: +# "In shutdown, no new events can be scheduled" +# To work around this we restart the nova-compute services. +# Speaking to the nova team, this seems to be an issue in oslo.service, +# with a fix proposed here: https://review.openstack.org/#/c/641907. +# This issue also seems to affect the proxy services, which exit non-zero in +# response to a SIGHUP, so restart those too. +# TODO(mgoddard): Remove this workaround when this bug has been fixed. -- name: Sighup nova-consoleauth - command: docker exec -t nova_consoleauth kill -1 1 - when: inventory_hostname in groups['nova-consoleauth'] +- name: Send SIGHUP to nova services + become: true + command: docker exec -t {{ item.value.container_name }} kill -1 1 + when: + - inventory_hostname in groups[item.value.group] + - item.value.enabled | bool + - item.key in nova_services_require_nova_conf + - not item.key.startswith('nova-compute') + - not item.key.endswith('proxy') + with_dict: "{{ nova_services }}" -- name: Sighup nova-scheduler - command: docker exec -t nova_scheduler kill -1 1 - when: inventory_hostname in groups['nova-scheduler'] - -- name: Sighup nova-compute - command: docker exec -t nova_compute kill -1 1 - when: inventory_hostname in groups['compute'] +- name: Restart nova compute and proxy services + become: true + kolla_docker: + action: restart_container + common_options: "{{ docker_common_options }}" + name: "{{ item.value.container_name }}" + when: + - inventory_hostname in groups[item.value.group] + - 
item.value.enabled | bool + - item.key in nova_services_require_nova_conf + - item.key.startswith('nova-compute') + or item.key.endswith('proxy') + with_dict: "{{ nova_services }}"