Coordinate haproxy and keepalived restarts

Keepalived and haproxy cooperate to provide control plane HA in kolla-ansible deployments. Certain care should be exerted to avoid prolonged availability loss during reconfigurations and upgrades. This patch aims to provide this care. There is nothing special about keepalived upgrade compared to reconfig, hence it is simplified to run the same code as for deploy. The broken logic of safe upgrade is replaced by common handler code which's goal is to ensure we down current master only after we have backups ready. This change introduces a switch to kolla_docker module that allows to ignore missing containers (as they are logically stopped). ignore_missing is the switch's name. All tests are included. Change-Id: I22ddec5f7ee4a7d3d502649a158a7e005fe29c48
2020-04-27 10:40:15 +02:00 · 2020-04-27 10:40:15 +02:00 · c2d0bf30ea
commit c2d0bf30ea
parent 4c4ad2b87b
6 changed files with 186 additions and 35 deletions
--- a/ansible/library/kolla_docker.py
+++ b/ansible/library/kolla_docker.py
@ -936,8 +936,10 @@ class DockerWorker(object):
            graceful_timeout = 10
        container = self.check_container()
        if not container:
-            self.module.fail_json(
-                msg="No such container: {} to stop".format(name))
+            ignore_missing = self.params.get('ignore_missing')
+            if not ignore_missing:
+                self.module.fail_json(
+                    msg="No such container: {} to stop".format(name))
        elif not container['Status'].startswith('Exited '):
            self.changed = True
            self.dc.stop(name, timeout=graceful_timeout)
@ -1069,6 +1071,7 @@ def generate_module():
        dimensions=dict(required=False, type='dict', default=dict()),
        tty=dict(required=False, type='bool', default=False),
        client_timeout=dict(required=False, type='int', default=120),
+        ignore_missing=dict(required=False, type='bool', default=False),
    )
    required_if = [
        ['action', 'pull_image', ['image']],
--- a/ansible/roles/haproxy/handlers/main.yml
+++ b/ansible/roles/haproxy/handlers/main.yml
@ -1,5 +1,51 @@
 ---
- name: Restart haproxy container
+# NOTE(yoctozepto): this handler dance is to ensure we delay restarting master
+# keepalived and haproxy which control VIP address until we have working backups.
+# This could be improved by checking if backup keepalived do not report FAULT state.
+# Master node is handled specially to let it close down connections and only then
+# drop the VIP address by stopping keepalived service.
+
+# NOTE(yoctozepto): we need fresh VIP address placement info (facts may be old)
+- name: Check IP addresses on the API interface
+  vars:
+    version: "{{ '6' if api_address_family == 'ipv6' else '4' }}"
+  become: true
+  command: ip -{{ version }} -o addr show dev {{ api_interface }}
+  register: ip_addr_output
+  changed_when: false
+  when:
+    - kolla_action != "config"
+  listen:
+    - Restart haproxy container
+    - Restart keepalived container
+
+- name: Group HA nodes by status
+  vars:
+    re_safe_address: "{{ kolla_internal_vip_address | regex_escape }}"
+  group_by:
+    key: kolla_ha_is_master_{{ ip_addr_output.stdout is regex('\b' + re_safe_address + '\b') }}
+  when:
+    - kolla_action != "config"
+  listen:
+    - Restart haproxy container
+    - Restart keepalived container
+
+- name: Stop backup keepalived container
+  become: true
+  kolla_docker:
+    action: "stop_container"
+    # NOTE(yoctozepto): backup node might not have keepalived yet - ignore
+    ignore_missing: true
+    common_options: "{{ docker_common_options }}"
+    name: "keepalived"
+  when:
+    - kolla_action != "config"
+    - groups.kolla_ha_is_master_False is defined
+    - inventory_hostname in groups.kolla_ha_is_master_False
+  listen:
+    - Restart keepalived container
+
+- name: Restart backup haproxy container
  vars:
    service_name: "haproxy"
    service: "{{ haproxy_services[service_name] }}"
@ -14,12 +60,20 @@
    dimensions: "{{ service.dimensions }}"
  when:
    - kolla_action != "config"
-    - inventory_hostname in groups[service.group]
-    - service.enabled | bool
+    - groups.kolla_ha_is_master_False is defined
+    - inventory_hostname in groups.kolla_ha_is_master_False
+  listen:
+    - Restart haproxy container
+    - Restart keepalived container
  notify:
-    - Waiting for haproxy to start
+    - Wait for backup haproxy to start

- name: Restart keepalived container
+- name: Wait for backup haproxy to start
+  wait_for:
+    host: "{{ api_interface_address }}"
+    port: "{{ haproxy_monitor_port }}"
+
+- name: Start backup keepalived container
  vars:
    service_name: "keepalived"
    service: "{{ haproxy_services[service_name] }}"
@ -34,17 +88,92 @@
    dimensions: "{{ service.dimensions }}"
  when:
    - kolla_action != "config"
-    - inventory_hostname in groups[service.group]
-    - service.enabled | bool
+    - groups.kolla_ha_is_master_False is defined
+    - inventory_hostname in groups.kolla_ha_is_master_False
+  listen:
+    - Restart keepalived container
  notify:
-    - Waiting for virtual IP to appear
+    - Wait for virtual IP to appear

- name: Waiting for haproxy to start
+# NOTE(yoctozepto): This is to ensure haproxy can close any open connections
+# to the VIP address.
+- name: Stop master haproxy container
+  become: true
+  kolla_docker:
+    action: "stop_container"
+    common_options: "{{ docker_common_options }}"
+    name: "haproxy"
+  when:
+    - kolla_action != "config"
+    - groups.kolla_ha_is_master_True is defined
+    - inventory_hostname in groups.kolla_ha_is_master_True
+  listen:
+    - Restart keepalived container
+
+- name: Stop master keepalived container
+  become: true
+  kolla_docker:
+    action: "stop_container"
+    common_options: "{{ docker_common_options }}"
+    name: "keepalived"
+  when:
+    - kolla_action != "config"
+    - groups.kolla_ha_is_master_True is defined
+    - inventory_hostname in groups.kolla_ha_is_master_True
+  listen:
+    - Restart keepalived container
+
+- name: Start master haproxy container
+  vars:
+    service_name: "haproxy"
+    service: "{{ haproxy_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "recreate_or_restart_container"
+    common_options: "{{ docker_common_options }}"
+    name: "{{ service.container_name }}"
+    image: "{{ service.image }}"
+    privileged: "{{ service.privileged | default(False) }}"
+    volumes: "{{ service.volumes }}"
+    dimensions: "{{ service.dimensions }}"
+  when:
+    - kolla_action != "config"
+    - groups.kolla_ha_is_master_True is defined
+    - inventory_hostname in groups.kolla_ha_is_master_True
+  listen:
+    - Restart haproxy container
+    - Restart keepalived container
+  notify:
+    - Wait for master haproxy to start
+
+- name: Wait for master haproxy to start
  wait_for:
    host: "{{ api_interface_address }}"
    port: "{{ haproxy_monitor_port }}"

- name: Waiting for virtual IP to appear
+- name: Start master keepalived container
+  vars:
+    service_name: "keepalived"
+    service: "{{ haproxy_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "recreate_or_restart_container"
+    common_options: "{{ docker_common_options }}"
+    name: "{{ service.container_name }}"
+    image: "{{ service.image }}"
+    privileged: "{{ service.privileged | default(False) }}"
+    volumes: "{{ service.volumes }}"
+    dimensions: "{{ service.dimensions }}"
+  when:
+    - kolla_action != "config"
+    - groups.kolla_ha_is_master_True is defined
+    - inventory_hostname in groups.kolla_ha_is_master_True
+  listen:
+    - Restart keepalived container
+  notify:
+    - Wait for virtual IP to appear
+
+- name: Wait for virtual IP to appear
  wait_for:
    host: "{{ kolla_internal_vip_address }}"
    port: "{{ haproxy_monitor_port }}"
--- a/ansible/roles/haproxy/tasks/check-containers.yml
+++ b/ansible/roles/haproxy/tasks/check-containers.yml
@ -1,5 +1,5 @@
 ---
- name: Deploy haproxy containers
+- name: Check haproxy containers
  become: true
  kolla_docker:
    action: "compare_container"
--- a/ansible/roles/haproxy/tasks/upgrade.yml
+++ b/ansible/roles/haproxy/tasks/upgrade.yml
@ -1,22 +1,2 @@
 ---
- import_tasks: config-host.yml
-
- import_tasks: config.yml
-
- name: Stopping all slave keepalived containers
-  vars:
-    key: "{{ 'ipv6' if api_address_family == 'ipv6' else 'ipv4_secondaries' }}"
-    addresses: "{{ hostvars[inventory_hostname]['ansible_' + api_interface].get(key, []) | map(attribute='address') | list }}"
-  become: true
-  kolla_docker:
-    action: "stop_container"
-    common_options: "{{ docker_common_options }}"
-    name: "keepalived"
-  when: kolla_internal_vip_address not in addresses
-  notify:
-    - Restart keepalived container
-
-# NOTE(yoctozepto): haproxy role handlers should not be flushed early.
-# site.yml handles all haproxy things in a dedicated play.
-# This is to avoid extra haproxy service restart.
-# See: https://bugs.launchpad.net/kolla-ansible/+bug/1875228
+- import_tasks: deploy.yml
--- a/releasenotes/notes/bug-1863510-part-haproxy-0a76829d48818f92.yaml
+++ b/releasenotes/notes/bug-1863510-part-haproxy-0a76829d48818f92.yaml
@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Makes haproxy and keepalived restarts during Kolla-Ansible actions more
+    robust, especially in multinode scenarios (HA).
--- a/tests/test_kolla_docker.py
+++ b/tests/test_kolla_docker.py
@ -94,6 +94,7 @@ class ModuleArgsTest(base.BaseTestCase):
            tty=dict(required=False, type='bool', default=False),
            client_timeout=dict(required=False, type='int', default=120),
            healthcheck=dict(required=False, type='dict'),
+            ignore_missing=dict(required=False, type='bool', default=False),
        )
        required_if = [
            ['action', 'pull_image', ['image']],
@ -175,7 +176,15 @@ FAKE_DATA = {
         'Image': 'myregistrydomain.com:5000/ubuntu:16.04',
         'ImageID': 'sha256:c5f1cf30',
         'Labels': {},
-         'Names': '/my_container'}
+         'Names': '/my_container'},
+        {'Created': 1463578195,
+         'Status': 'Exited (0) 2 hours ago',
+         'HostConfig': {'NetworkMode': 'default'},
+         'Id': 'e40d8e7188',
+         'Image': 'myregistrydomain.com:5000/ubuntu:16.04',
+         'ImageID': 'sha256:c5f1cf30',
+         'Labels': {},
+         'Names': '/exited_container'},
    ],

    'container_inspect': {
@ -396,6 +405,18 @@ class TestContainer(base.BaseTestCase):
        self.assertTrue(self.dw.changed)
        self.dw.dc.containers.assert_called_once_with(all=True)
        self.dw.dc.stop.assert_called_once_with('my_container', timeout=10)
+        self.dw.module.fail_json.assert_not_called()
+
+    def test_stop_container_already_stopped(self):
+        self.dw = get_DockerWorker({'name': 'exited_container',
+                                    'action': 'stop_container'})
+        self.dw.dc.containers.return_value = self.fake_data['containers']
+        self.dw.stop_container()
+
+        self.assertFalse(self.dw.changed)
+        self.dw.dc.containers.assert_called_once_with(all=True)
+        self.dw.module.fail_json.assert_not_called()
+        self.dw.dc.stop.assert_not_called()

    def test_stop_container_not_exists(self):
        self.dw = get_DockerWorker({'name': 'fake_container',
@ -405,9 +426,22 @@ class TestContainer(base.BaseTestCase):

        self.assertFalse(self.dw.changed)
        self.dw.dc.containers.assert_called_once_with(all=True)
+        self.dw.dc.stop.assert_not_called()
        self.dw.module.fail_json.assert_called_once_with(
            msg="No such container: fake_container to stop")

+    def test_stop_container_not_exists_ignore_missing(self):
+        self.dw = get_DockerWorker({'name': 'fake_container',
+                                    'action': 'stop_container',
+                                    'ignore_missing': True})
+        self.dw.dc.containers.return_value = self.fake_data['containers']
+        self.dw.stop_container()
+
+        self.assertFalse(self.dw.changed)
+        self.dw.dc.containers.assert_called_once_with(all=True)
+        self.dw.dc.stop.assert_not_called()
+        self.dw.module.fail_json.assert_not_called()
+
    def test_stop_and_remove_container(self):
        self.dw = get_DockerWorker({'name': 'my_container',
                                    'action': 'stop_and_remove_container'})