From 56b4352f9edd23c38bb24ff6eaec6748858e3e9f Mon Sep 17 00:00:00 2001
From: Mark Goddard <mark@stackhpc.com>
Date: Mon, 17 Sep 2018 16:01:52 +0100
Subject: [PATCH] Fix fact gathering with --limit

Prior to this change, when the --limit argument is used, each host in the
limit gathers facts for every other host. This is clearly unnecessary, and
can result in up to (N-1)^2 fact gathers.

This change gathers facts for each host only once. Hosts that are not in
the limit are divided between those that are in the limit, and facts are
gathered via delegation.

This change also factors out the fact gathering logic into a separate
playbook that is imported where necessary.

Change-Id: I923df5af41a7f1b7b0142d0da185a9a0979be543
---
 ansible/gather-facts.yml | 44 ++++++++++++++++++++++++++++++++++++++++
 ansible/kolla-host.yml   | 32 +----------------------------
 ansible/site.yml         | 33 +-----------------------------
 3 files changed, 46 insertions(+), 63 deletions(-)
 create mode 100644 ansible/gather-facts.yml
diff --git a/ansible/gather-facts.yml b/ansible/gather-facts.yml
new file mode 100644
index 0000000000..8ec05730fb
--- /dev/null
+++ b/ansible/gather-facts.yml
@@ -0,0 +1,44 @@
+---
+# NOTE(awiddersheim): Gather facts for all hosts as a first step, since
+# several plays in the importing playbooks need them to build configuration.
+# 'gather_facts: false' avoids Ansible gathering facts twice; the explicit
+# setup task below does the gathering instead.
+- name: Gather facts for all hosts
+  hosts: all
+  serial: '{{ kolla_serial|default("0") }}'
+  gather_facts: false
+  tasks:
+    - name: Gather facts
+      setup:
+
+    # ansible_play_hosts ignores 'serial' batching, so only --limit is detected.
+    - name: Group hosts to determine when using --limit
+      group_by:
+        key: "all_using_limit_{{ (ansible_play_hosts | length) != (groups['all'] | length) }}"
+  tags: always
+
+# NOTE(pbourke): This case covers deploying subsets of hosts using --limit. The
+# limit arg will cause the first play to gather facts only about that node,
+# meaning facts such as IP addresses for rabbitmq nodes etc. will be undefined
+# in the case of adding a single compute node.
+# NOTE(mgoddard): Divide all hosts to be queried between the hosts selected via
+# the limit. ansible_play_hosts is used as it is not affected by 'serial'.
+- name: Gather facts for all hosts (if using --limit)
+  hosts: all_using_limit_True
+  serial: '{{ kolla_serial|default("0") }}'
+  gather_facts: false
+  vars:
+    batch_index: "{{ ansible_play_hosts.index(inventory_hostname) }}"
+    batch_size: "{{ ansible_play_hosts | length }}"
+    # Use a python list slice to divide the group up.
+    # Syntax: [<start index>:<end index>:<step size>]
+    delegate_hosts: "{{ groups['all'][batch_index | int::batch_size | int] }}"
+  tasks:
+    - name: Gather facts
+      setup:
+      delegate_facts: true
+      delegate_to: "{{ item }}"
+      with_items: "{{ delegate_hosts }}"
+      # Hosts in this play had their facts gathered during the first play.
+      when: item not in ansible_play_hosts
+  tags: always
diff --git a/ansible/kolla-host.yml b/ansible/kolla-host.yml
index e0b9bec9be..6ebb25e95e 100644
--- a/ansible/kolla-host.yml
+++ b/ansible/kolla-host.yml
@@ -1,35 +1,5 @@
 ---
-# NOTE(awiddersheim): Gather facts for all hosts as a
-# first step since several plays below require them when
-# building their configurations. The below 'gather_facts'
-# set to 'false' is a bit confusing but this is to avoid
-# Ansible gathering facts twice.
-- name: Gather facts for all hosts
-  hosts: all
-  serial: '{{ kolla_serial|default("0") }}'
-  gather_facts: false
-  tasks:
-    - setup:
-  tags: always
-
-# NOTE(pbourke): This case covers deploying subsets of hosts using --limit. The
-# limit arg will cause the first play to gather facts only about that node,
-# meaning facts such as IP addresses for rabbitmq nodes etc. will be undefined
-# in the case of adding a single compute node.
-# We don't want to add the delegate parameters to the above play as it will
-# result in ((num_nodes-1)^2) number of SSHs when running for all nodes
-# which can be very inefficient.
-- name: Gather facts for all hosts (if using --limit)
-  hosts: all
-  serial: '{{ kolla_serial|default("0") }}'
-  gather_facts: false
-  tasks:
-    - setup:
-      delegate_facts: True
-      delegate_to: "{{ item }}"
-      with_items: "{{ groups['all'] }}"
-      when:
-        - (ansible_play_batch | length) != (groups['all'] | length)
+- import_playbook: gather-facts.yml
 
 - name: Apply role baremetal
   hosts: baremetal
diff --git a/ansible/site.yml b/ansible/site.yml
index 818c8407dc..3cc9b0b886 100644
--- a/ansible/site.yml
+++ b/ansible/site.yml
@@ -1,36 +1,5 @@
 ---
-# NOTE(awiddersheim): Gather facts for all hosts as a
-# first step since several plays below require them when
-# building their configurations. The below 'gather_facts'
-# set to 'false' is a bit confusing but this is to avoid
-# Ansible gathering facts twice.
-- name: Gather facts for all hosts
-  hosts: all
-  serial: '{{ kolla_serial|default("0") }}'
-  gather_facts: false
-  tasks:
-    - setup:
-  tags: always
-
-# NOTE(pbourke): This case covers deploying subsets of hosts using --limit. The
-# limit arg will cause the first play to gather facts only about that node,
-# meaning facts such as IP addresses for rabbitmq nodes etc. will be undefined
-# in the case of adding a single compute node.
-# We don't want to add the delegate parameters to the above play as it will
-# result in ((num_nodes-1)^2) number of SSHs when running for all nodes
-# which can be very inefficient.
-- name: Gather facts for all hosts (if using --limit)
-  hosts: all
-  serial: '{{ kolla_serial|default("0") }}'
-  gather_facts: false
-  tasks:
-    - setup:
-      delegate_facts: True
-      delegate_to: "{{ item }}"
-      with_items: "{{ groups['all'] }}"
-      when:
-        - (ansible_play_batch | length) != (groups['all'] | length)
-  tags: always
+- import_playbook: gather-facts.yml
 
 # NOTE(mgoddard): In large environments, even tasks that are skipped can take a
 # significant amount of time. This is an optimisation to prevent any tasks