etcd: Add support for more scenarios
This commit addresses a few shortcomings in the etcd service: * Adding or removing etcd nodes required manual intervention. * The etcd service would have brief outages during upgrades or reconfigures because restarts weren't always serialised. This makes the etcd service follow a similar pattern to mariadb: * There is now a distinction between bootstrapping the cluster and adding / removing another member. * This more closely follows etcd's upstream bootstrapping guidelines. * The etcd role now serialises restarts internally so the kolla_serial pattern is no longer appropriate (or necessary). This does not remove the need for manual intervention in all failure modes: the documentation has been updated to address the most common issues. Note that there's repetition in the container specifications: this is somewhat deliberate. In a future cleanup, it's intended to reduce the duplication. Change-Id: I39829ba0c5894f8e549f9b83b416e6db4fafd96f
This commit is contained in:
parent
db79eb0a55
commit
ed3b27cc92
@ -5,15 +5,18 @@ etcd_services:
|
|||||||
group: etcd
|
group: etcd
|
||||||
enabled: true
|
enabled: true
|
||||||
environment:
|
environment:
|
||||||
|
# KOLLA_BOOTSTRAP_STATUS is used to indicate whether the container should
|
||||||
|
# be recreated. Otherwise the kolla_container task doesn't detect that the
|
||||||
|
# environment has changed if variables are removed.
|
||||||
|
KOLLA_BOOTSTRAP_STATUS: "bootstrap completed"
|
||||||
|
ETCDCTL_API: "3"
|
||||||
|
ETCDCTL_ENDPOINTS: "{{ etcd_client_internal_endpoint }}"
|
||||||
|
ETCDCTL_WRITE_OUT: "json"
|
||||||
ETCD_DATA_DIR: "/var/lib/etcd"
|
ETCD_DATA_DIR: "/var/lib/etcd"
|
||||||
ETCD_NAME: "{{ ansible_facts.hostname }}"
|
ETCD_NAME: "{{ ansible_facts.hostname }}"
|
||||||
ETCD_ADVERTISE_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}"
|
ETCD_ADVERTISE_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}"
|
||||||
ETCD_LISTEN_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}"
|
ETCD_LISTEN_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}"
|
||||||
ETCD_INITIAL_ADVERTISE_PEER_URLS: "{{ etcd_peer_internal_endpoint }}"
|
|
||||||
ETCD_LISTEN_PEER_URLS: "{{ etcd_peer_internal_endpoint }}"
|
ETCD_LISTEN_PEER_URLS: "{{ etcd_peer_internal_endpoint }}"
|
||||||
ETCD_INITIAL_CLUSTER_TOKEN: "{{ etcd_cluster_token }}"
|
|
||||||
ETCD_INITIAL_CLUSTER: "{% for host in groups['etcd'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}"
|
|
||||||
ETCD_INITIAL_CLUSTER_STATE: "new"
|
|
||||||
ETCD_OUT_FILE: "/var/log/kolla/etcd/etcd.log"
|
ETCD_OUT_FILE: "/var/log/kolla/etcd/etcd.log"
|
||||||
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
|
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
|
||||||
ETCD_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
|
ETCD_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
|
||||||
@ -52,3 +55,8 @@ etcd_extra_volumes: "{{ default_extra_volumes }}"
|
|||||||
############
|
############
|
||||||
etcd_client_internal_endpoint: "{{ etcd_protocol }}://{{ api_interface_address | put_address_in_context('url') }}:{{ etcd_client_port }}"
|
etcd_client_internal_endpoint: "{{ etcd_protocol }}://{{ api_interface_address | put_address_in_context('url') }}:{{ etcd_client_port }}"
|
||||||
etcd_peer_internal_endpoint: "{{ etcd_protocol }}://{{ api_interface_address | put_address_in_context('url') }}:{{ etcd_peer_port }}"
|
etcd_peer_internal_endpoint: "{{ etcd_protocol }}://{{ api_interface_address | put_address_in_context('url') }}:{{ etcd_peer_port }}"
|
||||||
|
|
||||||
|
###################
|
||||||
|
# Managing members
|
||||||
|
###################
|
||||||
|
etcd_remove_deleted_members: "no"
|
||||||
|
@ -1,16 +1,59 @@
|
|||||||
---
|
---
|
||||||
- name: Restart etcd container
|
- name: Bootstrap etcd on new cluster
|
||||||
vars:
|
include_tasks: 'bootstrap_cluster.yml'
|
||||||
service_name: "etcd"
|
|
||||||
service: "{{ etcd_services[service_name] }}"
|
|
||||||
become: true
|
|
||||||
kolla_container:
|
|
||||||
action: "recreate_or_restart_container"
|
|
||||||
common_options: "{{ docker_common_options }}"
|
|
||||||
name: "{{ service.container_name }}"
|
|
||||||
image: "{{ service.image }}"
|
|
||||||
environment: "{{ service.environment }}"
|
|
||||||
volumes: "{{ service.volumes }}"
|
|
||||||
dimensions: "{{ service.dimensions }}"
|
|
||||||
when:
|
when:
|
||||||
- kolla_action != "config"
|
- kolla_action != "config"
|
||||||
|
listen:
|
||||||
|
- Bootstrap etcd cluster
|
||||||
|
|
||||||
|
- name: Look up the cluster leader
|
||||||
|
include_tasks: 'lookup_leader.yml'
|
||||||
|
when:
|
||||||
|
- kolla_action != "config"
|
||||||
|
listen:
|
||||||
|
- Restart etcd container
|
||||||
|
- Bootstrap etcd services
|
||||||
|
- Bootstrap etcd cluster
|
||||||
|
- Check for deleted members
|
||||||
|
|
||||||
|
- name: Bootstrap etcd on new services
|
||||||
|
include_tasks: 'bootstrap_services.yml'
|
||||||
|
when:
|
||||||
|
- groups.etcd_had_volume_False is defined
|
||||||
|
- inventory_hostname in groups.etcd_had_volume_False
|
||||||
|
- kolla_action != "config"
|
||||||
|
listen:
|
||||||
|
- Bootstrap etcd services
|
||||||
|
|
||||||
|
- name: Rolling restart of etcd non-leaders
|
||||||
|
include_tasks: 'restart_services.yml'
|
||||||
|
when:
|
||||||
|
- inventory_hostname not in (groups.etcd_is_leader_True | default([]))
|
||||||
|
- groups.etcd.index(inventory_hostname) % 4 == item
|
||||||
|
- kolla_action != "config"
|
||||||
|
listen:
|
||||||
|
- Restart etcd container
|
||||||
|
- Bootstrap etcd services
|
||||||
|
- Bootstrap etcd cluster
|
||||||
|
loop:
|
||||||
|
- 0
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 3
|
||||||
|
|
||||||
|
- name: Restart etcd leader
|
||||||
|
include_tasks: 'restart_services.yml'
|
||||||
|
when:
|
||||||
|
- inventory_hostname in (groups.etcd_is_leader_True | default([]))
|
||||||
|
- kolla_action != "config"
|
||||||
|
listen:
|
||||||
|
- Restart etcd container
|
||||||
|
- Bootstrap etcd services
|
||||||
|
- Bootstrap etcd cluster
|
||||||
|
|
||||||
|
- name: Remove deleted members
|
||||||
|
include_tasks: 'remove_deleted_members.yml'
|
||||||
|
when:
|
||||||
|
- kolla_action != "config"
|
||||||
|
listen:
|
||||||
|
- Check for deleted members
|
||||||
|
25
ansible/roles/etcd/tasks/bootstrap.yml
Normal file
25
ansible/roles/etcd/tasks/bootstrap.yml
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
---
|
||||||
|
- import_tasks: lookup_cluster.yml
|
||||||
|
|
||||||
|
# NOTE(jan.gutter): The following two tasks set facts that aren't really used.
|
||||||
|
# They exist only to trigger the handlers for bootstrapping:
|
||||||
|
# If no etcd data volumes exist, bootstrap a new initial cluster.
|
||||||
|
# If some volumes exist, add the new nodes to an existing cluster.
|
||||||
|
|
||||||
|
- name: Determine whether a new cluster needs bootstrapping
|
||||||
|
set_fact:
|
||||||
|
etcd_bootstrap_cluster: "{% for host in groups['etcd'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}"
|
||||||
|
when: not (etcd_cluster_exists | bool)
|
||||||
|
changed_when: not (etcd_cluster_exists | bool)
|
||||||
|
notify: Bootstrap etcd cluster
|
||||||
|
|
||||||
|
- name: Determine when new services need bootstrapping
|
||||||
|
set_fact:
|
||||||
|
etcd_bootstrap_services: "{% for host in groups['etcd_had_volume_False'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}"
|
||||||
|
when:
|
||||||
|
- etcd_cluster_exists | bool
|
||||||
|
- groups.etcd_had_volume_False is defined
|
||||||
|
changed_when:
|
||||||
|
- etcd_cluster_exists | bool
|
||||||
|
- groups.etcd_had_volume_False is defined
|
||||||
|
notify: Bootstrap etcd services
|
60
ansible/roles/etcd/tasks/bootstrap_cluster.yml
Normal file
60
ansible/roles/etcd/tasks/bootstrap_cluster.yml
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
---
|
||||||
|
- name: Bootstrapping etcd cluster
|
||||||
|
vars:
|
||||||
|
service_name: "etcd"
|
||||||
|
service: "{{ etcd_services[service_name] }}"
|
||||||
|
become: true
|
||||||
|
kolla_container:
|
||||||
|
action: "start_container"
|
||||||
|
common_options: "{{ docker_common_options }}"
|
||||||
|
environment:
|
||||||
|
KOLLA_BOOTSTRAP_STATUS: "bootstrap cluster"
|
||||||
|
ETCD_INITIAL_CLUSTER_STATE: "new"
|
||||||
|
ETCD_INITIAL_ADVERTISE_PEER_URLS: "{{ etcd_peer_internal_endpoint }}"
|
||||||
|
ETCD_INITIAL_CLUSTER_TOKEN: "{{ etcd_cluster_token }}"
|
||||||
|
ETCD_INITIAL_CLUSTER: "{% for host in groups['etcd'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}"
|
||||||
|
ETCDCTL_API: "3"
|
||||||
|
ETCDCTL_ENDPOINTS: "{{ etcd_client_internal_endpoint }}"
|
||||||
|
ETCDCTL_WRITE_OUT: "json"
|
||||||
|
ETCD_DATA_DIR: "/var/lib/etcd"
|
||||||
|
ETCD_NAME: "{{ ansible_facts.hostname }}"
|
||||||
|
ETCD_ADVERTISE_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}"
|
||||||
|
ETCD_LISTEN_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}"
|
||||||
|
ETCD_LISTEN_PEER_URLS: "{{ etcd_peer_internal_endpoint }}"
|
||||||
|
ETCD_OUT_FILE: "/var/log/kolla/etcd/etcd.log"
|
||||||
|
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
|
||||||
|
ETCD_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
|
||||||
|
ETCD_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}"
|
||||||
|
ETCD_PEER_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
|
||||||
|
ETCD_PEER_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}"
|
||||||
|
image: "{{ service.image }}"
|
||||||
|
name: "{{ service.container_name }}"
|
||||||
|
volumes: "{{ service.volumes }}"
|
||||||
|
dimensions: "{{ service.dimensions }}"
|
||||||
|
|
||||||
|
- name: Wait for etcd service port liveness
|
||||||
|
wait_for:
|
||||||
|
host: "{{ api_interface_address }}"
|
||||||
|
port: "{{ etcd_client_port }}"
|
||||||
|
connect_timeout: 1
|
||||||
|
timeout: 60
|
||||||
|
register: check_etcd_port
|
||||||
|
until: check_etcd_port is success
|
||||||
|
retries: 10
|
||||||
|
delay: 6
|
||||||
|
|
||||||
|
- name: Wait for etcd endpoints to be healthy
|
||||||
|
become: true
|
||||||
|
vars:
|
||||||
|
service_name: "etcd"
|
||||||
|
service: "{{ etcd_services[service_name] }}"
|
||||||
|
command: >-
|
||||||
|
{{ kolla_container_engine }} exec {{ service.container_name }}
|
||||||
|
etcdctl endpoint health
|
||||||
|
changed_when: false
|
||||||
|
register: result
|
||||||
|
until:
|
||||||
|
- result is success
|
||||||
|
- ((result.stdout | from_json | first)['health'] | default(False) | bool)
|
||||||
|
retries: 10
|
||||||
|
delay: 6
|
55
ansible/roles/etcd/tasks/bootstrap_services.yml
Normal file
55
ansible/roles/etcd/tasks/bootstrap_services.yml
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
---
|
||||||
|
- name: Add new member to etcd cluster
|
||||||
|
vars:
|
||||||
|
service_name: "etcd"
|
||||||
|
service: "{{ etcd_services[service_name] }}"
|
||||||
|
become: true
|
||||||
|
command: >-
|
||||||
|
{{ kolla_container_engine }} exec {{ service.container_name }}
|
||||||
|
etcdctl member add {{ ansible_facts.hostname }}
|
||||||
|
--peer-urls={{ etcd_protocol }}://{{ 'api' | kolla_address(inventory_hostname) | put_address_in_context('url') }}:{{ etcd_peer_port }}
|
||||||
|
delegate_to: "{{ etcd_cluster_leader | default(groups[service.group][0]) }}"
|
||||||
|
|
||||||
|
- name: Bootstrapping etcd containers
|
||||||
|
vars:
|
||||||
|
service_name: "etcd"
|
||||||
|
service: "{{ etcd_services[service_name] }}"
|
||||||
|
become: true
|
||||||
|
kolla_container:
|
||||||
|
action: "start_container"
|
||||||
|
common_options: "{{ docker_common_options }}"
|
||||||
|
environment:
|
||||||
|
KOLLA_BOOTSTRAP_STATUS: "bootstrap service"
|
||||||
|
ETCD_INITIAL_CLUSTER_STATE: "existing"
|
||||||
|
ETCD_INITIAL_ADVERTISE_PEER_URLS: "{{ etcd_peer_internal_endpoint }}"
|
||||||
|
ETCD_INITIAL_CLUSTER_TOKEN: "{{ etcd_cluster_token }}"
|
||||||
|
ETCD_INITIAL_CLUSTER: "{% for host in groups['etcd_had_volume_True'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }},{% endfor %}{{ ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(inventory_hostname) | put_address_in_context('url') }}:{{ etcd_peer_port }}"
|
||||||
|
ETCDCTL_API: "3"
|
||||||
|
ETCDCTL_ENDPOINTS: "{{ etcd_client_internal_endpoint }}"
|
||||||
|
ETCDCTL_WRITE_OUT: "json"
|
||||||
|
ETCD_DATA_DIR: "/var/lib/etcd"
|
||||||
|
ETCD_NAME: "{{ ansible_facts.hostname }}"
|
||||||
|
ETCD_ADVERTISE_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}"
|
||||||
|
ETCD_LISTEN_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}"
|
||||||
|
ETCD_LISTEN_PEER_URLS: "{{ etcd_peer_internal_endpoint }}"
|
||||||
|
ETCD_OUT_FILE: "/var/log/kolla/etcd/etcd.log"
|
||||||
|
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
|
||||||
|
ETCD_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
|
||||||
|
ETCD_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}"
|
||||||
|
ETCD_PEER_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
|
||||||
|
ETCD_PEER_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}"
|
||||||
|
image: "{{ service.image }}"
|
||||||
|
name: "{{ service.container_name }}"
|
||||||
|
volumes: "{{ service.volumes }}"
|
||||||
|
dimensions: "{{ service.dimensions }}"
|
||||||
|
|
||||||
|
- name: Wait for etcd service port liveness
|
||||||
|
wait_for:
|
||||||
|
host: "{{ api_interface_address }}"
|
||||||
|
port: "{{ etcd_client_port }}"
|
||||||
|
connect_timeout: 1
|
||||||
|
timeout: 60
|
||||||
|
register: check_etcd_client_port
|
||||||
|
until: check_etcd_client_port is success
|
||||||
|
retries: 10
|
||||||
|
delay: 6
|
@ -3,5 +3,7 @@
|
|||||||
|
|
||||||
- import_tasks: check-containers.yml
|
- import_tasks: check-containers.yml
|
||||||
|
|
||||||
|
- import_tasks: bootstrap.yml
|
||||||
|
|
||||||
- name: Flush handlers
|
- name: Flush handlers
|
||||||
meta: flush_handlers
|
meta: flush_handlers
|
||||||
|
26
ansible/roles/etcd/tasks/lookup_cluster.yml
Normal file
26
ansible/roles/etcd/tasks/lookup_cluster.yml
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
---
|
||||||
|
- name: Ensure etcd volume
|
||||||
|
become: true
|
||||||
|
kolla_container:
|
||||||
|
action: "create_volume"
|
||||||
|
common_options: "{{ docker_common_options }}"
|
||||||
|
name: "kolla_etcd"
|
||||||
|
register: etcd_volume
|
||||||
|
|
||||||
|
# NOTE(jan.gutter): If the play is interrupted before properly bootstrapping,
|
||||||
|
# we will incorrectly assume that an etcd cluster exists. This likely requires
|
||||||
|
# manual intervention to unwedge. If a volume exists we must assume there's
|
||||||
|
# data on it.
|
||||||
|
|
||||||
|
- name: Divide hosts by their etcd volume availability
|
||||||
|
group_by:
|
||||||
|
key: etcd_had_volume_{{ etcd_volume is not changed }}
|
||||||
|
changed_when: false
|
||||||
|
|
||||||
|
- name: Establish whether the cluster has already existed
|
||||||
|
set_fact:
|
||||||
|
etcd_cluster_exists: "{{ groups.etcd_had_volume_True is defined }}"
|
||||||
|
changed_when:
|
||||||
|
- etcd_remove_deleted_members | bool
|
||||||
|
- groups.etcd_had_volume_True is defined
|
||||||
|
notify: Check for deleted members
|
41
ansible/roles/etcd/tasks/lookup_leader.yml
Normal file
41
ansible/roles/etcd/tasks/lookup_leader.yml
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
---
|
||||||
|
# NOTE(jan.gutter): These tasks assume a cluster is running
|
||||||
|
- name: Check for the etcd leader
|
||||||
|
vars:
|
||||||
|
service_name: "etcd"
|
||||||
|
service: "{{ etcd_services[service_name] }}"
|
||||||
|
become: true
|
||||||
|
# NOTE(jan.gutter): We need to set the ETCD environment vars here to
|
||||||
|
# handle an upgrade scenario from older etcd containers. These can be
|
||||||
|
# removed once the new workflow has been in place for a cycle or two.
|
||||||
|
command: >-
|
||||||
|
{{ kolla_container_engine }} exec
|
||||||
|
-e ETCDCTL_API=3
|
||||||
|
-e ETCDCTL_ENDPOINTS="{{ etcd_client_internal_endpoint }}"
|
||||||
|
-e ETCDCTL_WRITE_OUT="json"
|
||||||
|
{{ service.container_name }}
|
||||||
|
etcdctl endpoint status
|
||||||
|
changed_when: false
|
||||||
|
when:
|
||||||
|
- inventory_hostname in (groups.etcd_had_volume_True | default([]))
|
||||||
|
register: etcd_endpoint_status_result
|
||||||
|
|
||||||
|
- name: Divide hosts by their etcd leader status
|
||||||
|
vars:
|
||||||
|
etcd_endpoint_status: >-
|
||||||
|
{{ etcd_endpoint_status_result.stdout | default('[]') | from_json }}
|
||||||
|
etcd_member_id: >-
|
||||||
|
{{ etcd_endpoint_status[0]['Status']['header']['member_id']
|
||||||
|
| default('') }}
|
||||||
|
etcd_leader_id: >-
|
||||||
|
{{ etcd_endpoint_status[0]['Status']['leader']
|
||||||
|
| default('none') }}
|
||||||
|
group_by:
|
||||||
|
key: etcd_is_leader_{{ etcd_member_id == etcd_leader_id }}
|
||||||
|
changed_when: false
|
||||||
|
|
||||||
|
- name: Set the etcd cluster leader
|
||||||
|
set_fact:
|
||||||
|
etcd_cluster_leader: "{{ groups.etcd_is_leader_True | sort | first }}"
|
||||||
|
when: groups.etcd_is_leader_True is defined
|
||||||
|
changed_when: false
|
39
ansible/roles/etcd/tasks/remove_deleted_members.yml
Normal file
39
ansible/roles/etcd/tasks/remove_deleted_members.yml
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
---
|
||||||
|
- name: List the etcd members
|
||||||
|
vars:
|
||||||
|
service_name: "etcd"
|
||||||
|
service: "{{ etcd_services[service_name] }}"
|
||||||
|
become: true
|
||||||
|
command: >-
|
||||||
|
{{ kolla_container_engine }} exec {{ service.container_name }}
|
||||||
|
etcdctl member list
|
||||||
|
changed_when: false
|
||||||
|
run_once: true
|
||||||
|
delegate_to: "{{ etcd_cluster_leader | default(groups[service.group][0]) }}"
|
||||||
|
register: etcd_member_list_result
|
||||||
|
|
||||||
|
- name: Remove deleted members from the etcd cluster
|
||||||
|
vars:
|
||||||
|
service_name: "etcd"
|
||||||
|
service: "{{ etcd_services[service_name] }}"
|
||||||
|
etcd_members_from_inventory: >-
|
||||||
|
{{ groups['etcd']
|
||||||
|
| map('extract', hostvars, 'ansible_facts')
|
||||||
|
| map(attribute='hostname')
|
||||||
|
| list }}
|
||||||
|
etcd_deleted_members: >-
|
||||||
|
{{ etcd_member_list_result.stdout | from_json
|
||||||
|
| json_query('members[].name')
|
||||||
|
| difference(etcd_members_from_inventory) }}
|
||||||
|
etcd_member_id: >-
|
||||||
|
{{ etcd_member_list_result.stdout | from_json
|
||||||
|
| json_query('members[].{key: name, value: ID}') | items2dict }}
|
||||||
|
become: true
|
||||||
|
command: >-
|
||||||
|
{{ kolla_container_engine }} exec {{ service.container_name }}
|
||||||
|
etcdctl member remove {{ '%x' % etcd_member_id[etcd_deleted_member] }}
|
||||||
|
run_once: true
|
||||||
|
delegate_to: "{{ etcd_cluster_leader | default(groups[service.group][0]) }}"
|
||||||
|
loop: "{{ etcd_deleted_members }}"
|
||||||
|
loop_control:
|
||||||
|
loop_var: etcd_deleted_member
|
25
ansible/roles/etcd/tasks/restart_services.yml
Normal file
25
ansible/roles/etcd/tasks/restart_services.yml
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
---
|
||||||
|
- name: Restart etcd container
|
||||||
|
vars:
|
||||||
|
service_name: "etcd"
|
||||||
|
service: "{{ etcd_services[service_name] }}"
|
||||||
|
become: true
|
||||||
|
kolla_container:
|
||||||
|
action: "recreate_or_restart_container"
|
||||||
|
common_options: "{{ docker_common_options }}"
|
||||||
|
name: "{{ service.container_name }}"
|
||||||
|
image: "{{ service.image }}"
|
||||||
|
volumes: "{{ service.volumes }}"
|
||||||
|
dimensions: "{{ service.dimensions }}"
|
||||||
|
environment: "{{ service.environment }}"
|
||||||
|
|
||||||
|
- name: Wait for etcd service port liveness
|
||||||
|
wait_for:
|
||||||
|
host: "{{ api_interface_address }}"
|
||||||
|
port: "{{ etcd_client_port }}"
|
||||||
|
connect_timeout: 1
|
||||||
|
timeout: 60
|
||||||
|
register: check_etcd_client_port
|
||||||
|
until: check_etcd_client_port is success
|
||||||
|
retries: 10
|
||||||
|
delay: 6
|
@ -458,7 +458,6 @@
|
|||||||
hosts:
|
hosts:
|
||||||
- etcd
|
- etcd
|
||||||
- '&enable_etcd_True'
|
- '&enable_etcd_True'
|
||||||
serial: '{{ kolla_serial|default("0") }}'
|
|
||||||
roles:
|
roles:
|
||||||
- { role: etcd,
|
- { role: etcd,
|
||||||
tags: etcd }
|
tags: etcd }
|
||||||
|
97
doc/source/admin/etcd.rst
Normal file
97
doc/source/admin/etcd.rst
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
.. _etcd:
|
||||||
|
|
||||||
|
=============
|
||||||
|
Managing etcd
|
||||||
|
=============
|
||||||
|
|
||||||
|
Kolla Ansible can manage the lifecycle of an etcd cluster and supports the
|
||||||
|
following operations:
|
||||||
|
|
||||||
|
* Bootstrapping a clean multi-node etcd cluster
|
||||||
|
* Adding a new member to the etcd cluster
|
||||||
|
* Optionally, automatically removing a deleted node from the etcd cluster.
|
||||||
|
|
||||||
|
It is highly recommended to read the operator documentation for the version
|
||||||
|
of etcd deployed in the cluster.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Once an etcd cluster is bootstrapped, the etcd service takes most of its
|
||||||
|
configuration from the etcd database itself.
|
||||||
|
|
||||||
|
This pattern is very different from many other Kolla Ansible services, and
|
||||||
|
is a source of confusion for operators unfamiliar with etcd.
|
||||||
|
|
||||||
|
Cluster vs Node Bootstrapping
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Kolla Ansible distinguishes between two forms of bootstrapping in an etcd
|
||||||
|
cluster:
|
||||||
|
|
||||||
|
* Bootstrapping multiple nodes at the same time to bring up a new cluster
|
||||||
|
* Bootstrapping a single node to add it to an existing cluster
|
||||||
|
|
||||||
|
These correspond to the `new` and `existing` parameters for
|
||||||
|
`ETCD_INITIAL_CLUSTER_STATE` in the upstream documentation. Once an etcd node
|
||||||
|
has completed bootstrap, the bootstrap configuration is ignored, even if it is
|
||||||
|
changed.
|
||||||
|
|
||||||
|
Kolla Ansible will decide to perform a new cluster bootstrap if it detects that
|
||||||
|
there is no existing data on the etcd nodes. Otherwise it assumes that there is
|
||||||
|
a healthy etcd cluster and it will add a new node to it.
|
||||||
|
|
||||||
|
Forcing Bootstrapping
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Kolla Ansible looks for the `kolla_etcd` volume on the node. If this volume
|
||||||
|
is available, it assumes that the bootstrap process has run on the node and
|
||||||
|
the volume contains the required config.
|
||||||
|
|
||||||
|
However, if the process was interrupted (externally, or by an error), this
|
||||||
|
volume might be misconfigured. In order to prevent data loss, manual
|
||||||
|
intervention is required.
|
||||||
|
|
||||||
|
Before retriggering bootstrap make sure that there is no valuable data on the
|
||||||
|
volume. This could be because the node was not in service, or because the data
|
||||||
|
is persisted elsewhere.
|
||||||
|
|
||||||
|
To retrigger a bootstrap (for either the cluster, or for a single node),
|
||||||
|
remove the volume from all affected nodes:
|
||||||
|
|
||||||
|
``docker volume rm kolla_etcd``
|
||||||
|
|
||||||
|
Rerunning Kolla Ansible will then trigger the appropriate workflow and either
|
||||||
|
a blank cluster will be bootstrapped, or an empty member will be added to
|
||||||
|
the existing cluster.
|
||||||
|
|
||||||
|
Manual Commands
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
In order to manage etcd manually, the ``etcdctl`` command can be used inside
|
||||||
|
the `etcd` container. This command has been set up with the appropriate
|
||||||
|
environment variables for integrating with automation.
|
||||||
|
|
||||||
|
``etcdctl`` is configured with json output by default:
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
# list cluster members in a human-readable table
|
||||||
|
docker exec -it etcd etcdctl -w table member list
|
||||||
|
|
||||||
|
Removing Dead Nodes
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
If ``globals.yml`` has the value ``etcd_remove_deleted_members: "yes"`` then
|
||||||
|
etcd nodes that are not in the inventory will be removed from the etcd cluster.
|
||||||
|
|
||||||
|
Any errors in the inventory can therefore cause unintended removal.
|
||||||
|
|
||||||
|
To manually remove a dead node from the etcd cluster, use the following
|
||||||
|
commands:
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
# list cluster members and identify dead member
|
||||||
|
docker exec -it etcd etcdctl -w table member list
|
||||||
|
# remove dead member
|
||||||
|
docker exec -it etcd etcdctl member remove MEMBER_ID_IN_HEX
|
@ -9,5 +9,6 @@ Admin Guides
|
|||||||
tls
|
tls
|
||||||
acme
|
acme
|
||||||
mariadb-backup-and-restore
|
mariadb-backup-and-restore
|
||||||
|
etcd
|
||||||
production-architecture-guide
|
production-architecture-guide
|
||||||
deployment-philosophy
|
deployment-philosophy
|
||||||
|
@ -173,6 +173,14 @@ For each host, clean up its services:
|
|||||||
|
|
||||||
.. _removing-existing-compute-nodes:
|
.. _removing-existing-compute-nodes:
|
||||||
|
|
||||||
|
If the node is also running the `etcd` service, set
|
||||||
|
``etcd_remove_deleted_members: "yes"`` in `globals.yml` to automatically
|
||||||
|
remove nodes from the `etcd` cluster that have been removed from the inventory.
|
||||||
|
|
||||||
|
Alternatively the `etcd` members can be removed manually with `etcdctl`. For
|
||||||
|
more details, please consult the `runtime reconfiguration` documentation
|
||||||
|
section for the version of etcd in operation.
|
||||||
|
|
||||||
Removing existing compute nodes
|
Removing existing compute nodes
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|
||||||
|
@ -903,3 +903,10 @@ workaround_ansible_issue_8743: yes
|
|||||||
|
|
||||||
# this is UDP port
|
# this is UDP port
|
||||||
#hacluster_corosync_port: 5405
|
#hacluster_corosync_port: 5405
|
||||||
|
|
||||||
|
##############
|
||||||
|
# etcd options
|
||||||
|
##############
|
||||||
|
# If `etcd_remove_deleted_members` is enabled, Kolla Ansible will automatically
|
||||||
|
# remove etcd members from the cluster that are no longer in the inventory.
|
||||||
|
#etcd_remove_deleted_members: "no"
|
||||||
|
12
releasenotes/notes/managed-etcd-72fb2d3fbba516d9.yaml
Normal file
12
releasenotes/notes/managed-etcd-72fb2d3fbba516d9.yaml
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
The `etcd` tooling has been updated to better serialize restarts when
|
||||||
|
applying configuration or updates. Previously minor outages might occur
|
||||||
|
since all services were restarted in the same task.
|
||||||
|
- |
|
||||||
|
The `etcd` tooling has been updated to handle adding and removing nodes.
|
||||||
|
Previously this was an undocumented manual process and required creating
|
||||||
|
service containers. Operators can refer to the
|
||||||
|
`etcd admin guide <https://docs.openstack.org/kolla-ansible/latest/admin/etcd.html>`__
|
||||||
|
for more details.
|
@ -52,7 +52,7 @@ function prepare_images {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $SCENARIO == "cephadm" ]]; then
|
if [[ $SCENARIO == "cephadm" ]]; then
|
||||||
GATE_IMAGES+=",^cinder"
|
GATE_IMAGES+=",^cinder,^etcd"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $SCENARIO == "cells" ]]; then
|
if [[ $SCENARIO == "cells" ]]; then
|
||||||
|
@ -77,6 +77,7 @@ openstack_tag_suffix: "{{ docker_image_tag_suffix }}"
|
|||||||
enable_zun: "yes"
|
enable_zun: "yes"
|
||||||
enable_kuryr: "yes"
|
enable_kuryr: "yes"
|
||||||
enable_etcd: "yes"
|
enable_etcd: "yes"
|
||||||
|
etcd_remove_deleted_members: "yes"
|
||||||
docker_configure_for_zun: "yes"
|
docker_configure_for_zun: "yes"
|
||||||
containerd_configure_for_zun: "yes"
|
containerd_configure_for_zun: "yes"
|
||||||
enable_cinder: "yes"
|
enable_cinder: "yes"
|
||||||
@ -132,6 +133,9 @@ enable_cinder: "yes"
|
|||||||
glance_backend_ceph: "yes"
|
glance_backend_ceph: "yes"
|
||||||
cinder_backend_ceph: "yes"
|
cinder_backend_ceph: "yes"
|
||||||
nova_backend_ceph: "yes"
|
nova_backend_ceph: "yes"
|
||||||
|
# Internal etcd
|
||||||
|
enable_etcd: "yes"
|
||||||
|
etcd_remove_deleted_members: "yes"
|
||||||
|
|
||||||
enable_ceph_rgw: "yes"
|
enable_ceph_rgw: "yes"
|
||||||
ceph_rgw_hosts:
|
ceph_rgw_hosts:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user