From d4530e242db7c45c10729123be8d7a8fbab38296 Mon Sep 17 00:00:00 2001 From: Andrew Bonney Date: Wed, 17 Apr 2024 08:23:25 +0100 Subject: [PATCH] Adjust RabbitMQ HA policy to make reply queues HA Changes in oslo.messaging for 2023.1 exposed a known race condition in RabbitMQ when dealing with non-HA classic queues. When an RMQ cluster member is taken down, clients failing over to other members may erroneously be told a queue exists when it is in the process of being deleted. This can cause them to permanently sit waiting for messages from a queue that no longer exists until their services are restarted. Making the reply queues HA resolves this issue, at the expense of a 3x increase in reply queues across the cluster. My assumption is that reply queues were previously excluded from HA policy as a performance gain given their link to the number of compute nodes in an OpenStack deployment. Context: https://bugs.launchpad.net/oslo.messaging/+bug/2031512 Depends-On: https://review.opendev.org/c/openstack/openstack-ansible-rabbitmq_server/+/916042 Change-Id: Iee6b5f8cc1ad04988c8634f8b6e026e2f8c75b52 --- inventory/group_vars/all/infra.yml | 2 +- .../notes/rmq-ha-reply-queue-policy-95a8fd54561fc9cd.yaml | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/rmq-ha-reply-queue-policy-95a8fd54561fc9cd.yaml diff --git a/inventory/group_vars/all/infra.yml b/inventory/group_vars/all/infra.yml index 54f26f6abc..9f5fbdf3dd 100644 --- a/inventory/group_vars/all/infra.yml +++ b/inventory/group_vars/all/infra.yml @@ -32,7 +32,7 @@ oslomsg_rabbit_quorum_queues: False rabbitmq_policies: - name: "HA" - pattern: '^(?!(amq\.)|(.*_fanout_)|(reply_)).*' + pattern: '^(?!(amq\.)|(.*_fanout_)).*' priority: 0 tags: "ha-mode=all" state: "{{ (oslomsg_rabbit_quorum_queues | default(True) or not rabbitmq_queue_replication) | ternary('absent', 'present') }}" diff --git a/releasenotes/notes/rmq-ha-reply-queue-policy-95a8fd54561fc9cd.yaml 
b/releasenotes/notes/rmq-ha-reply-queue-policy-95a8fd54561fc9cd.yaml new file mode 100644 index 0000000000..8cff08643a --- /dev/null +++ b/releasenotes/notes/rmq-ha-reply-queue-policy-95a8fd54561fc9cd.yaml @@ -0,0 +1,7 @@ +--- +upgrade: + - | + When using RabbitMQ in a high availability cluster (non-quorum queues), + transient 'reply\_' queues are now included in the HA policy where they + previously were not. Note that this will increase the load on the RabbitMQ + cluster, particularly for deployments with large numbers of compute nodes.