Coordinate haproxy and keepalived restarts

Keepalived and haproxy cooperate to provide control plane HA in
kolla-ansible deployments.
Care must be taken to avoid prolonged loss of availability
during reconfiguration and upgrades.
This patch provides that care.
There is nothing special about keepalived upgrade compared to
reconfig, hence it is simplified to run the same code as for
deploy.
The broken safe-upgrade logic is replaced by common handler
code whose goal is to ensure we take down the current master
only after the backups are ready.

This change introduces a switch to kolla_docker module that allows
to ignore missing containers (as they are logically stopped).
ignore_missing is the switch's name.
All tests are included.

Change-Id: I22ddec5f7ee4a7d3d502649a158a7e005fe29c48
This commit is contained in:
Radosław Piliszek 2020-04-27 10:40:15 +02:00
parent 4c4ad2b87b
commit c2d0bf30ea
6 changed files with 186 additions and 35 deletions

View File

@ -936,8 +936,10 @@ class DockerWorker(object):
graceful_timeout = 10
container = self.check_container()
if not container:
self.module.fail_json(
msg="No such container: {} to stop".format(name))
ignore_missing = self.params.get('ignore_missing')
if not ignore_missing:
self.module.fail_json(
msg="No such container: {} to stop".format(name))
elif not container['Status'].startswith('Exited '):
self.changed = True
self.dc.stop(name, timeout=graceful_timeout)
@ -1069,6 +1071,7 @@ def generate_module():
dimensions=dict(required=False, type='dict', default=dict()),
tty=dict(required=False, type='bool', default=False),
client_timeout=dict(required=False, type='int', default=120),
ignore_missing=dict(required=False, type='bool', default=False),
)
required_if = [
['action', 'pull_image', ['image']],

View File

@ -1,5 +1,51 @@
---
- name: Restart haproxy container
# NOTE(yoctozepto): this handler dance is to ensure we delay restarting master
# keepalived and haproxy which control VIP address until we have working backups.
# This could be improved by checking that the backup keepalived instances do not report a FAULT state.
# Master node is handled specially to let it close down connections and only then
# drop the VIP address by stopping keepalived service.
# NOTE(yoctozepto): we need fresh VIP address placement info (facts may be old)
# Record which addresses currently sit on the API interface so later handlers
# can tell whether this node holds the VIP (i.e. is the keepalived master).
- name: Check IP addresses on the API interface
  vars:
    # Select the ip(8) address family flag from the configured API family.
    version: "{{ '6' if api_address_family == 'ipv6' else '4' }}"
  become: true
  command: ip -{{ version }} -o addr show dev {{ api_interface }}
  register: ip_addr_output
  # Pure inspection — never reports a change.
  changed_when: false
  when:
    - kolla_action != "config"
  listen:
    - Restart haproxy container
    - Restart keepalived container
# Partition nodes into dynamic groups kolla_ha_is_master_True /
# kolla_ha_is_master_False, depending on whether the VIP address appeared
# in the `ip addr` output captured above.
- name: Group HA nodes by status
  vars:
    # Escape the VIP so it is matched literally inside the regex.
    re_safe_address: "{{ kolla_internal_vip_address | regex_escape }}"
  group_by:
    # \b word boundaries prevent a prefix match against a longer address.
    key: kolla_ha_is_master_{{ ip_addr_output.stdout is regex('\b' + re_safe_address + '\b') }}
  when:
    - kolla_action != "config"
  listen:
    - Restart haproxy container
    - Restart keepalived container
# Stop keepalived on the backup nodes first so they can be restarted with the
# new configuration before the master is touched, keeping the VIP available.
- name: Stop backup keepalived container
  become: true
  kolla_docker:
    action: "stop_container"
    # NOTE(yoctozepto): backup node might not have keepalived yet - ignore
    ignore_missing: true
    common_options: "{{ docker_common_options }}"
    name: "keepalived"
  when:
    - kolla_action != "config"
    # Only run on nodes grouped as non-masters (VIP not present locally).
    - groups.kolla_ha_is_master_False is defined
    - inventory_hostname in groups.kolla_ha_is_master_False
  listen:
    - Restart keepalived container
- name: Restart backup haproxy container
vars:
service_name: "haproxy"
service: "{{ haproxy_services[service_name] }}"
@ -14,12 +60,20 @@
dimensions: "{{ service.dimensions }}"
when:
- kolla_action != "config"
- inventory_hostname in groups[service.group]
- service.enabled | bool
- groups.kolla_ha_is_master_False is defined
- inventory_hostname in groups.kolla_ha_is_master_False
listen:
- Restart haproxy container
- Restart keepalived container
notify:
- Waiting for haproxy to start
- Wait for backup haproxy to start
- name: Restart keepalived container
- name: Wait for backup haproxy to start
wait_for:
host: "{{ api_interface_address }}"
port: "{{ haproxy_monitor_port }}"
- name: Start backup keepalived container
vars:
service_name: "keepalived"
service: "{{ haproxy_services[service_name] }}"
@ -34,17 +88,92 @@
dimensions: "{{ service.dimensions }}"
when:
- kolla_action != "config"
- inventory_hostname in groups[service.group]
- service.enabled | bool
- groups.kolla_ha_is_master_False is defined
- inventory_hostname in groups.kolla_ha_is_master_False
listen:
- Restart keepalived container
notify:
- Waiting for virtual IP to appear
- Wait for virtual IP to appear
- name: Waiting for haproxy to start
# NOTE(yoctozepto): This is to ensure haproxy can close any open connections
# to the VIP address.
- name: Stop master haproxy container
become: true
kolla_docker:
action: "stop_container"
common_options: "{{ docker_common_options }}"
name: "haproxy"
when:
- kolla_action != "config"
- groups.kolla_ha_is_master_True is defined
- inventory_hostname in groups.kolla_ha_is_master_True
listen:
- Restart keepalived container
- name: Stop master keepalived container
become: true
kolla_docker:
action: "stop_container"
common_options: "{{ docker_common_options }}"
name: "keepalived"
when:
- kolla_action != "config"
- groups.kolla_ha_is_master_True is defined
- inventory_hostname in groups.kolla_ha_is_master_True
listen:
- Restart keepalived container
- name: Start master haproxy container
vars:
service_name: "haproxy"
service: "{{ haproxy_services[service_name] }}"
become: true
kolla_docker:
action: "recreate_or_restart_container"
common_options: "{{ docker_common_options }}"
name: "{{ service.container_name }}"
image: "{{ service.image }}"
privileged: "{{ service.privileged | default(False) }}"
volumes: "{{ service.volumes }}"
dimensions: "{{ service.dimensions }}"
when:
- kolla_action != "config"
- groups.kolla_ha_is_master_True is defined
- inventory_hostname in groups.kolla_ha_is_master_True
listen:
- Restart haproxy container
- Restart keepalived container
notify:
- Wait for master haproxy to start
- name: Wait for master haproxy to start
wait_for:
host: "{{ api_interface_address }}"
port: "{{ haproxy_monitor_port }}"
- name: Waiting for virtual IP to appear
- name: Start master keepalived container
vars:
service_name: "keepalived"
service: "{{ haproxy_services[service_name] }}"
become: true
kolla_docker:
action: "recreate_or_restart_container"
common_options: "{{ docker_common_options }}"
name: "{{ service.container_name }}"
image: "{{ service.image }}"
privileged: "{{ service.privileged | default(False) }}"
volumes: "{{ service.volumes }}"
dimensions: "{{ service.dimensions }}"
when:
- kolla_action != "config"
- groups.kolla_ha_is_master_True is defined
- inventory_hostname in groups.kolla_ha_is_master_True
listen:
- Restart keepalived container
notify:
- Wait for virtual IP to appear
- name: Wait for virtual IP to appear
wait_for:
host: "{{ kolla_internal_vip_address }}"
port: "{{ haproxy_monitor_port }}"

View File

@ -1,5 +1,5 @@
---
- name: Deploy haproxy containers
- name: Check haproxy containers
become: true
kolla_docker:
action: "compare_container"

View File

@ -1,22 +1,2 @@
---
- import_tasks: config-host.yml
- import_tasks: config.yml
- name: Stopping all slave keepalived containers
vars:
key: "{{ 'ipv6' if api_address_family == 'ipv6' else 'ipv4_secondaries' }}"
addresses: "{{ hostvars[inventory_hostname]['ansible_' + api_interface].get(key, []) | map(attribute='address') | list }}"
become: true
kolla_docker:
action: "stop_container"
common_options: "{{ docker_common_options }}"
name: "keepalived"
when: kolla_internal_vip_address not in addresses
notify:
- Restart keepalived container
# NOTE(yoctozepto): haproxy role handlers should not be flushed early.
# site.yml handles all haproxy things in a dedicated play.
# This is to avoid extra haproxy service restart.
# See: https://bugs.launchpad.net/kolla-ansible/+bug/1875228
- import_tasks: deploy.yml

View File

@ -0,0 +1,5 @@
---
fixes:
- |
Makes haproxy and keepalived restarts during Kolla-Ansible actions more
robust, especially in multinode scenarios (HA).

View File

@ -94,6 +94,7 @@ class ModuleArgsTest(base.BaseTestCase):
tty=dict(required=False, type='bool', default=False),
client_timeout=dict(required=False, type='int', default=120),
healthcheck=dict(required=False, type='dict'),
ignore_missing=dict(required=False, type='bool', default=False),
)
required_if = [
['action', 'pull_image', ['image']],
@ -175,7 +176,15 @@ FAKE_DATA = {
'Image': 'myregistrydomain.com:5000/ubuntu:16.04',
'ImageID': 'sha256:c5f1cf30',
'Labels': {},
'Names': '/my_container'}
'Names': '/my_container'},
{'Created': 1463578195,
'Status': 'Exited (0) 2 hours ago',
'HostConfig': {'NetworkMode': 'default'},
'Id': 'e40d8e7188',
'Image': 'myregistrydomain.com:5000/ubuntu:16.04',
'ImageID': 'sha256:c5f1cf30',
'Labels': {},
'Names': '/exited_container'},
],
'container_inspect': {
@ -396,6 +405,18 @@ class TestContainer(base.BaseTestCase):
self.assertTrue(self.dw.changed)
self.dw.dc.containers.assert_called_once_with(all=True)
self.dw.dc.stop.assert_called_once_with('my_container', timeout=10)
self.dw.module.fail_json.assert_not_called()
def test_stop_container_already_stopped(self):
    """Stopping a container already in Exited state is a silent no-op."""
    params = {'name': 'exited_container', 'action': 'stop_container'}
    self.dw = get_DockerWorker(params)
    self.dw.dc.containers.return_value = self.fake_data['containers']

    self.dw.stop_container()

    # The worker looked the container up, saw it had already exited,
    # and neither failed nor issued a stop call.
    self.dw.dc.containers.assert_called_once_with(all=True)
    self.dw.dc.stop.assert_not_called()
    self.dw.module.fail_json.assert_not_called()
    self.assertFalse(self.dw.changed)
def test_stop_container_not_exists(self):
self.dw = get_DockerWorker({'name': 'fake_container',
@ -405,9 +426,22 @@ class TestContainer(base.BaseTestCase):
self.assertFalse(self.dw.changed)
self.dw.dc.containers.assert_called_once_with(all=True)
self.dw.dc.stop.assert_not_called()
self.dw.module.fail_json.assert_called_once_with(
msg="No such container: fake_container to stop")
def test_stop_container_not_exists_ignore_missing(self):
    """With ignore_missing=True a missing container is not an error."""
    params = {'name': 'fake_container',
              'action': 'stop_container',
              'ignore_missing': True}
    self.dw = get_DockerWorker(params)
    self.dw.dc.containers.return_value = self.fake_data['containers']

    self.dw.stop_container()

    # Container lookup happened, but no stop was attempted and — unlike
    # the default behaviour — no failure was reported.
    self.dw.dc.containers.assert_called_once_with(all=True)
    self.dw.dc.stop.assert_not_called()
    self.dw.module.fail_json.assert_not_called()
    self.assertFalse(self.dw.changed)
def test_stop_and_remove_container(self):
self.dw = get_DockerWorker({'name': 'my_container',
'action': 'stop_and_remove_container'})