etcd: Add support for more scenarios
This commit addresses a few shortcomings in the etcd service: * Adding or removing etcd nodes required manual intervention. * The etcd service would have brief outages during upgrades or reconfigures because restarts weren't always serialised. This makes the etcd service follow a similar pattern to mariadb: * There is now a distinction between bootstrapping the cluster and adding / removing another member. * This more closely follows etcd's upstream bootstrapping guidelines. * The etcd role now serialises restarts internally so the kolla_serial pattern is no longer appropriate (or necessary). This does not remove the need for manual intervention in all failure modes: the documentation has been updated to address the most common issues. Note that there's repetition in the container specifications: this is somewhat deliberate. In a future cleanup, it's intended to reduce the duplication. Change-Id: I39829ba0c5894f8e549f9b83b416e6db4fafd96f
This commit is contained in:
parent
db79eb0a55
commit
ed3b27cc92
@ -5,15 +5,18 @@ etcd_services:
|
||||
group: etcd
|
||||
enabled: true
|
||||
environment:
|
||||
# KOLLA_BOOTSTRAP_STATUS is used to indicate whether the container should
|
||||
# be recreated. Otherwise the kolla_container task doesn't detect that the
|
||||
# environment has changed if variables are removed.
|
||||
KOLLA_BOOTSTRAP_STATUS: "bootstrap completed"
|
||||
ETCDCTL_API: "3"
|
||||
ETCDCTL_ENDPOINTS: "{{ etcd_client_internal_endpoint }}"
|
||||
ETCDCTL_WRITE_OUT: "json"
|
||||
ETCD_DATA_DIR: "/var/lib/etcd"
|
||||
ETCD_NAME: "{{ ansible_facts.hostname }}"
|
||||
ETCD_ADVERTISE_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}"
|
||||
ETCD_LISTEN_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}"
|
||||
ETCD_INITIAL_ADVERTISE_PEER_URLS: "{{ etcd_peer_internal_endpoint }}"
|
||||
ETCD_LISTEN_PEER_URLS: "{{ etcd_peer_internal_endpoint }}"
|
||||
ETCD_INITIAL_CLUSTER_TOKEN: "{{ etcd_cluster_token }}"
|
||||
ETCD_INITIAL_CLUSTER: "{% for host in groups['etcd'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}"
|
||||
ETCD_INITIAL_CLUSTER_STATE: "new"
|
||||
ETCD_OUT_FILE: "/var/log/kolla/etcd/etcd.log"
|
||||
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
|
||||
ETCD_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
|
||||
@ -52,3 +55,8 @@ etcd_extra_volumes: "{{ default_extra_volumes }}"
|
||||
############
|
||||
etcd_client_internal_endpoint: "{{ etcd_protocol }}://{{ api_interface_address | put_address_in_context('url') }}:{{ etcd_client_port }}"
|
||||
etcd_peer_internal_endpoint: "{{ etcd_protocol }}://{{ api_interface_address | put_address_in_context('url') }}:{{ etcd_peer_port }}"
|
||||
|
||||
###################
|
||||
# Managing members
|
||||
###################
|
||||
etcd_remove_deleted_members: "no"
|
||||
|
@ -1,16 +1,59 @@
|
||||
---
|
||||
- name: Restart etcd container
|
||||
vars:
|
||||
service_name: "etcd"
|
||||
service: "{{ etcd_services[service_name] }}"
|
||||
become: true
|
||||
kolla_container:
|
||||
action: "recreate_or_restart_container"
|
||||
common_options: "{{ docker_common_options }}"
|
||||
name: "{{ service.container_name }}"
|
||||
image: "{{ service.image }}"
|
||||
environment: "{{ service.environment }}"
|
||||
volumes: "{{ service.volumes }}"
|
||||
dimensions: "{{ service.dimensions }}"
|
||||
# Handler: includes bootstrap_cluster.yml when the "Bootstrap etcd cluster"
# notification fires. Skipped when kolla_action is "config".
- name: Bootstrap etcd on new cluster
  listen:
    - Bootstrap etcd cluster
  when: kolla_action != "config"
  include_tasks: bootstrap_cluster.yml
|
||||
|
||||
# Handler: includes lookup_leader.yml. It listens on every restart,
# bootstrap and member-removal notification used by this role, and is skipped
# when kolla_action is "config".
- name: Look up the cluster leader
  listen:
    - Restart etcd container
    - Bootstrap etcd services
    - Bootstrap etcd cluster
    - Check for deleted members
  when: kolla_action != "config"
  include_tasks: lookup_leader.yml
|
||||
|
||||
# Handler: includes bootstrap_services.yml, but only on hosts that are in the
# etcd_had_volume_False group. Skipped when kolla_action is "config".
- name: Bootstrap etcd on new services
  listen:
    - Bootstrap etcd services
  when:
    # Keep the "is defined" guard ahead of the membership test that reads
    # the same group.
    - groups.etcd_had_volume_False is defined
    - inventory_hostname in groups.etcd_had_volume_False
    - kolla_action != "config"
  include_tasks: bootstrap_services.yml
|
||||
|
||||
# Handler: includes restart_services.yml once per loop item (0..3). On each
# pass only hosts whose position in groups.etcd modulo 4 equals the current
# item are selected, and hosts in etcd_is_leader_True are always excluded.
# Skipped when kolla_action is "config".
- name: Rolling restart of etcd non-leaders
  listen:
    - Restart etcd container
    - Bootstrap etcd services
    - Bootstrap etcd cluster
  loop: [0, 1, 2, 3]
  when:
    - inventory_hostname not in (groups.etcd_is_leader_True | default([]))
    - groups.etcd.index(inventory_hostname) % 4 == item
    - kolla_action != "config"
  include_tasks: restart_services.yml
|
||||
|
||||
# Handler: includes restart_services.yml on hosts that are in the
# etcd_is_leader_True group. It is defined after the non-leader restart
# handler. Skipped when kolla_action is "config".
- name: Restart etcd leader
  listen:
    - Restart etcd container
    - Bootstrap etcd services
    - Bootstrap etcd cluster
  when:
    - inventory_hostname in (groups.etcd_is_leader_True | default([]))
    - kolla_action != "config"
  include_tasks: restart_services.yml
|
||||
|
||||
# Handler: includes remove_deleted_members.yml when the
# "Check for deleted members" notification fires. Skipped when kolla_action
# is "config".
- name: Remove deleted members
  listen:
    - Check for deleted members
  when: kolla_action != "config"
  include_tasks: remove_deleted_members.yml
|
||||
|
25
ansible/roles/etcd/tasks/bootstrap.yml
Normal file
25
ansible/roles/etcd/tasks/bootstrap.yml
Normal file
@ -0,0 +1,25 @@
|
||||
---
|
||||
# Import the lookup_cluster.yml tasks first: the tasks that follow read
# etcd_cluster_exists and the etcd_had_volume_* groups.
- import_tasks: lookup_cluster.yml
|
||||
|
||||
# NOTE(jan.gutter): The following two tasks set facts that aren't really used.
|
||||
# They serve the purpose to trigger the handlers for bootstrapping:
|
||||
# If no etcd data volumes exist, bootstrap a new initial cluster.
|
||||
# If some volumes exist, add the new nodes to an existing cluster.
|
||||
|
||||
# Runs only when etcd_cluster_exists is false. It stores a comma-separated
# "hostname=peer-url" list covering every host in groups['etcd'] and reports
# "changed" so that the "Bootstrap etcd cluster" handler is notified.
- name: Determine whether a new cluster needs bootstrapping
  when: not (etcd_cluster_exists | bool)
  set_fact:
    etcd_bootstrap_cluster: "{% for host in groups['etcd'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}"
  changed_when: not (etcd_cluster_exists | bool)
  notify:
    - Bootstrap etcd cluster
|
||||
|
||||
# Runs only when a cluster already exists and the etcd_had_volume_False group
# is defined. It stores a comma-separated "hostname=peer-url" list covering
# the hosts in that group and reports "changed" so that the
# "Bootstrap etcd services" handler is notified.
- name: Determine when new services need bootstrapping
  when:
    - etcd_cluster_exists | bool
    - groups.etcd_had_volume_False is defined
  set_fact:
    etcd_bootstrap_services: "{% for host in groups['etcd_had_volume_False'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}"
  changed_when:
    - etcd_cluster_exists | bool
    - groups.etcd_had_volume_False is defined
  notify:
    - Bootstrap etcd services
|
60
ansible/roles/etcd/tasks/bootstrap_cluster.yml
Normal file
60
ansible/roles/etcd/tasks/bootstrap_cluster.yml
Normal file
@ -0,0 +1,60 @@
|
||||
---
|
||||
# Start this host's etcd container with ETCD_INITIAL_CLUSTER_STATE "new" and
# an ETCD_INITIAL_CLUSTER list covering every host in groups['etcd']. The
# environment is written out in full here rather than taken from
# service.environment.
- name: Bootstrapping etcd cluster
  become: true
  vars:
    service_name: 'etcd'
    service: "{{ etcd_services[service_name] }}"
  kolla_container:
    action: 'start_container'
    common_options: "{{ docker_common_options }}"
    name: "{{ service.container_name }}"
    image: "{{ service.image }}"
    volumes: "{{ service.volumes }}"
    dimensions: "{{ service.dimensions }}"
    environment:
      # Bootstrap-specific settings.
      KOLLA_BOOTSTRAP_STATUS: 'bootstrap cluster'
      ETCD_INITIAL_CLUSTER_STATE: 'new'
      ETCD_INITIAL_ADVERTISE_PEER_URLS: '{{ etcd_peer_internal_endpoint }}'
      ETCD_INITIAL_CLUSTER_TOKEN: '{{ etcd_cluster_token }}'
      ETCD_INITIAL_CLUSTER: "{% for host in groups['etcd'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}"
      # etcdctl client settings used by "exec ... etcdctl" calls.
      ETCDCTL_API: '3'
      ETCDCTL_ENDPOINTS: '{{ etcd_client_internal_endpoint }}'
      ETCDCTL_WRITE_OUT: 'json'
      # Server identity and listen/advertise addresses.
      ETCD_DATA_DIR: '/var/lib/etcd'
      ETCD_NAME: '{{ ansible_facts.hostname }}'
      ETCD_ADVERTISE_CLIENT_URLS: '{{ etcd_client_internal_endpoint }}'
      ETCD_LISTEN_CLIENT_URLS: '{{ etcd_client_internal_endpoint }}'
      ETCD_LISTEN_PEER_URLS: '{{ etcd_peer_internal_endpoint }}'
      ETCD_OUT_FILE: '/var/log/kolla/etcd/etcd.log'
      KOLLA_CONFIG_STRATEGY: '{{ config_strategy }}'
      # TLS material; each value renders empty when etcd_enable_tls is false.
      ETCD_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
      ETCD_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}"
      ETCD_PEER_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
      ETCD_PEER_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}"
|
||||
|
||||
# Block until the etcd client port on this host's API address accepts
# connections. Each wait_for attempt lasts up to 60 seconds, and the task is
# retried up to 10 times with a 6 second pause between attempts.
- name: Wait for etcd service port liveness
  register: check_etcd_port
  wait_for:
    host: '{{ api_interface_address }}'
    port: '{{ etcd_client_port }}'
    timeout: 60
    connect_timeout: 1
  until: check_etcd_port is success
  delay: 6
  retries: 10
|
||||
|
||||
# Poll "etcdctl endpoint health" inside the etcd container until the command
# succeeds and the first entry of its JSON output has a true "health" field.
# Retried up to 10 times, 6 seconds apart; never reports "changed".
- name: Wait for etcd endpoints to be healthy
  vars:
    service_name: 'etcd'
    service: "{{ etcd_services[service_name] }}"
  become: true
  command: "{{ kolla_container_engine }} exec {{ service.container_name }} etcdctl endpoint health"
  register: result
  changed_when: false
  until:
    # The success check comes first so stdout is only parsed afterwards.
    - result is success
    - ((result.stdout | from_json | first)['health'] | default(False) | bool)
  delay: 6
  retries: 10
|
55
ansible/roles/etcd/tasks/bootstrap_services.yml
Normal file
55
ansible/roles/etcd/tasks/bootstrap_services.yml
Normal file
@ -0,0 +1,55 @@
|
||||
---
|
||||
# Register this host as a member by running "etcdctl member add" inside the
# etcd container. The command is delegated to etcd_cluster_leader, falling
# back to the first host of the service group when that fact is not set.
- name: Add new member to etcd cluster
  become: true
  delegate_to: "{{ etcd_cluster_leader | default(groups[service.group][0]) }}"
  vars:
    service_name: 'etcd'
    service: "{{ etcd_services[service_name] }}"
  command: >-
    {{ kolla_container_engine }} exec {{ service.container_name }}
    etcdctl member add
    {{ ansible_facts.hostname }}
    --peer-urls={{ etcd_protocol }}://{{ 'api' | kolla_address(inventory_hostname) | put_address_in_context('url') }}:{{ etcd_peer_port }}
|
||||
|
||||
# Start this host's etcd container with ETCD_INITIAL_CLUSTER_STATE
# "existing". ETCD_INITIAL_CLUSTER lists every host in
# groups['etcd_had_volume_True'] followed by this host itself. The
# environment is written out in full here rather than taken from
# service.environment.
- name: Bootstrapping etcd containers
  become: true
  vars:
    service_name: 'etcd'
    service: "{{ etcd_services[service_name] }}"
  kolla_container:
    action: 'start_container'
    common_options: "{{ docker_common_options }}"
    name: "{{ service.container_name }}"
    image: "{{ service.image }}"
    volumes: "{{ service.volumes }}"
    dimensions: "{{ service.dimensions }}"
    environment:
      # Bootstrap-specific settings.
      KOLLA_BOOTSTRAP_STATUS: 'bootstrap service'
      ETCD_INITIAL_CLUSTER_STATE: 'existing'
      ETCD_INITIAL_ADVERTISE_PEER_URLS: '{{ etcd_peer_internal_endpoint }}'
      ETCD_INITIAL_CLUSTER_TOKEN: '{{ etcd_cluster_token }}'
      ETCD_INITIAL_CLUSTER: "{% for host in groups['etcd_had_volume_True'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }},{% endfor %}{{ ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(inventory_hostname) | put_address_in_context('url') }}:{{ etcd_peer_port }}"
      # etcdctl client settings used by "exec ... etcdctl" calls.
      ETCDCTL_API: '3'
      ETCDCTL_ENDPOINTS: '{{ etcd_client_internal_endpoint }}'
      ETCDCTL_WRITE_OUT: 'json'
      # Server identity and listen/advertise addresses.
      ETCD_DATA_DIR: '/var/lib/etcd'
      ETCD_NAME: '{{ ansible_facts.hostname }}'
      ETCD_ADVERTISE_CLIENT_URLS: '{{ etcd_client_internal_endpoint }}'
      ETCD_LISTEN_CLIENT_URLS: '{{ etcd_client_internal_endpoint }}'
      ETCD_LISTEN_PEER_URLS: '{{ etcd_peer_internal_endpoint }}'
      ETCD_OUT_FILE: '/var/log/kolla/etcd/etcd.log'
      KOLLA_CONFIG_STRATEGY: '{{ config_strategy }}'
      # TLS material; each value renders empty when etcd_enable_tls is false.
      ETCD_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
      ETCD_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}"
      ETCD_PEER_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}"
      ETCD_PEER_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}"
|
||||
|
||||
# Block until the etcd client port on this host's API address accepts
# connections. Each wait_for attempt lasts up to 60 seconds, and the task is
# retried up to 10 times with a 6 second pause between attempts.
- name: Wait for etcd service port liveness
  register: check_etcd_client_port
  wait_for:
    host: '{{ api_interface_address }}'
    port: '{{ etcd_client_port }}'
    timeout: 60
    connect_timeout: 1
  until: check_etcd_client_port is success
  delay: 6
  retries: 10
|
@ -3,5 +3,7 @@
|
||||
|
||||
- import_tasks: check-containers.yml
|
||||
|
||||
- import_tasks: bootstrap.yml
|
||||
|
||||
- name: Flush handlers
|
||||
meta: flush_handlers
|
||||
|
26
ansible/roles/etcd/tasks/lookup_cluster.yml
Normal file
26
ansible/roles/etcd/tasks/lookup_cluster.yml
Normal file
@ -0,0 +1,26 @@
|
||||
---
|
||||
# Run the create_volume action for the "kolla_etcd" volume and keep the
# result in etcd_volume; the next task keys a host group on whether this
# task reported "changed".
- name: Ensure etcd volume
  register: etcd_volume
  become: true
  kolla_container:
    name: 'kolla_etcd'
    action: 'create_volume'
    common_options: '{{ docker_common_options }}'
|
||||
|
||||
# NOTE(jan.gutter): If the play is interrupted before properly bootstrapping,
|
||||
# we will incorrectly assume that an etcd cluster exists. This likely requires
|
||||
# manual intervention to unwedge. If a volume exists we must assume there's
|
||||
# data on it.
|
||||
|
||||
# Put each host in etcd_had_volume_True or etcd_had_volume_False, depending
# on whether the registered etcd_volume result is "not changed".
- name: Divide hosts by their etcd volume availability
  changed_when: false
  group_by:
    key: "etcd_had_volume_{{ etcd_volume is not changed }}"
|
||||
|
||||
# Set etcd_cluster_exists to whether the etcd_had_volume_True group is
# defined. The task reports "changed", and so notifies
# "Check for deleted members", only when etcd_remove_deleted_members is
# enabled and that group is defined.
- name: Establish whether the cluster has already existed
  set_fact:
    etcd_cluster_exists: '{{ groups.etcd_had_volume_True is defined }}'
  notify:
    - Check for deleted members
  changed_when:
    - etcd_remove_deleted_members | bool
    - groups.etcd_had_volume_True is defined
|
41
ansible/roles/etcd/tasks/lookup_leader.yml
Normal file
41
ansible/roles/etcd/tasks/lookup_leader.yml
Normal file
@ -0,0 +1,41 @@
|
||||
---
|
||||
# NOTE(jan.gutter): These tasks assume a cluster is running
|
||||
# Query "etcdctl endpoint status" inside the etcd container on hosts that are
# in etcd_had_volume_True, and register the result as
# etcd_endpoint_status_result. Never reports "changed".
- name: Check for the etcd leader
  when: inventory_hostname in (groups.etcd_had_volume_True | default([]))
  become: true
  vars:
    service_name: 'etcd'
    service: "{{ etcd_services[service_name] }}"
  # NOTE(jan.gutter): The ETCDCTL_* variables are passed explicitly on the
  # exec command line to cover upgrades from older etcd containers. They can
  # be dropped once the new workflow has been in place for a cycle or two.
  command: >-
    {{ kolla_container_engine }} exec
    -e ETCDCTL_API=3
    -e ETCDCTL_ENDPOINTS="{{ etcd_client_internal_endpoint }}"
    -e ETCDCTL_WRITE_OUT="json"
    {{ service.container_name }} etcdctl endpoint status
  register: etcd_endpoint_status_result
  changed_when: false
|
||||
|
||||
# Put each host in etcd_is_leader_True or etcd_is_leader_False by comparing
# its own member ID with the leader ID from the endpoint status output.
# The two fallback values differ ('' vs 'none'), so presumably a host with no
# status output lands in etcd_is_leader_False -- confirm against Ansible's
# handling of the undefined [0] lookup.
- name: Divide hosts by their etcd leader status
  changed_when: false
  vars:
    etcd_endpoint_status: "{{ etcd_endpoint_status_result.stdout | default('[]') | from_json }}"
    etcd_member_id: "{{ etcd_endpoint_status[0]['Status']['header']['member_id'] | default('') }}"
    etcd_leader_id: "{{ etcd_endpoint_status[0]['Status']['leader'] | default('none') }}"
  group_by:
    key: "etcd_is_leader_{{ etcd_member_id == etcd_leader_id }}"
|
||||
|
||||
# When the etcd_is_leader_True group is defined, store its first host (after
# sorting) as etcd_cluster_leader. Never reports "changed".
- name: Set the etcd cluster leader
  when: groups.etcd_is_leader_True is defined
  changed_when: false
  set_fact:
    etcd_cluster_leader: '{{ groups.etcd_is_leader_True | sort | first }}'
|
39
ansible/roles/etcd/tasks/remove_deleted_members.yml
Normal file
39
ansible/roles/etcd/tasks/remove_deleted_members.yml
Normal file
@ -0,0 +1,39 @@
|
||||
---
|
||||
# Run "etcdctl member list" once, inside the etcd container, and register the
# output as etcd_member_list_result. The command is delegated to
# etcd_cluster_leader, falling back to the first host of the service group
# when that fact is not set. Never reports "changed".
- name: List the etcd members
  run_once: true
  delegate_to: "{{ etcd_cluster_leader | default(groups[service.group][0]) }}"
  become: true
  vars:
    service_name: 'etcd'
    service: "{{ etcd_services[service_name] }}"
  command: "{{ kolla_container_engine }} exec {{ service.container_name }} etcdctl member list"
  register: etcd_member_list_result
  changed_when: false
|
||||
|
||||
# Run "etcdctl member remove" once for every member name in the registered
# member list that has no matching hostname among the hosts in
# groups['etcd']. Runs once, delegated to etcd_cluster_leader or, when that
# fact is not set, the first host of the service group.
- name: Remove deleted members from the etcd cluster
  run_once: true
  delegate_to: "{{ etcd_cluster_leader | default(groups[service.group][0]) }}"
  become: true
  vars:
    service_name: 'etcd'
    service: "{{ etcd_services[service_name] }}"
    # Hostnames (ansible_facts.hostname) of every host in the etcd group.
    etcd_members_from_inventory: >-
      {{ groups['etcd'] | map('extract', hostvars, 'ansible_facts')
      | map(attribute='hostname') | list }}
    # Member names from the member list that are not in the list above.
    etcd_deleted_members: >-
      {{ etcd_member_list_result.stdout | from_json | json_query('members[].name')
      | difference(etcd_members_from_inventory) }}
    # Mapping of member name -> member ID taken from the member list.
    etcd_member_id: >-
      {{ etcd_member_list_result.stdout | from_json
      | json_query('members[].{key: name, value: ID}')
      | items2dict }}
  loop: "{{ etcd_deleted_members }}"
  loop_control:
    loop_var: etcd_deleted_member
  # The member ID is formatted as lowercase hex ('%x') for the command line.
  command: >-
    {{ kolla_container_engine }} exec {{ service.container_name }}
    etcdctl member remove
    {{ '%x' % etcd_member_id[etcd_deleted_member] }}
|
25
ansible/roles/etcd/tasks/restart_services.yml
Normal file
25
ansible/roles/etcd/tasks/restart_services.yml
Normal file
@ -0,0 +1,25 @@
|
||||
---
|
||||
# Run the recreate_or_restart_container action for the etcd container, using
# the image, volumes, dimensions and environment from the etcd service
# definition.
- name: Restart etcd container
  become: true
  vars:
    service_name: 'etcd'
    service: "{{ etcd_services[service_name] }}"
  kolla_container:
    action: 'recreate_or_restart_container'
    common_options: '{{ docker_common_options }}'
    name: '{{ service.container_name }}'
    image: '{{ service.image }}'
    environment: '{{ service.environment }}'
    volumes: '{{ service.volumes }}'
    dimensions: '{{ service.dimensions }}'
|
||||
|
||||
# Block until the etcd client port on this host's API address accepts
# connections. Each wait_for attempt lasts up to 60 seconds, and the task is
# retried up to 10 times with a 6 second pause between attempts.
- name: Wait for etcd service port liveness
  register: check_etcd_client_port
  wait_for:
    host: '{{ api_interface_address }}'
    port: '{{ etcd_client_port }}'
    timeout: 60
    connect_timeout: 1
  until: check_etcd_client_port is success
  delay: 6
  retries: 10
|
@ -458,7 +458,6 @@
|
||||
hosts:
|
||||
- etcd
|
||||
- '&enable_etcd_True'
|
||||
serial: '{{ kolla_serial|default("0") }}'
|
||||
roles:
|
||||
- { role: etcd,
|
||||
tags: etcd }
|
||||
|
97
doc/source/admin/etcd.rst
Normal file
97
doc/source/admin/etcd.rst
Normal file
@ -0,0 +1,97 @@
|
||||
.. _etcd:
|
||||
|
||||
=============
|
||||
Managing etcd
|
||||
=============
|
||||
|
||||
Kolla Ansible can manage the lifecycle of an etcd cluster and supports the
|
||||
following operations:
|
||||
|
||||
* Bootstrapping a clean multi-node etcd cluster
|
||||
* Adding a new member to the etcd cluster
|
||||
* Optionally, automatically removing a deleted node from the etcd cluster.
|
||||
|
||||
It is highly recommended to read the operator documentation for the version
|
||||
of etcd deployed in the cluster.
|
||||
|
||||
.. note::
|
||||
|
||||
Once an etcd cluster is bootstrapped, the etcd service takes most of its
|
||||
configuration from the etcd database itself.
|
||||
|
||||
This pattern is very different from many other Kolla Ansible services, and
|
||||
is a source of confusion for operators unfamiliar with etcd.
|
||||
|
||||
Cluster vs Node Bootstrapping
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Kolla Ansible distinguishes between two forms of bootstrapping in an etcd
|
||||
cluster:
|
||||
|
||||
* Bootstrapping multiple nodes at the same time to bring up a new cluster
|
||||
* Bootstrapping a single node to add it to an existing cluster
|
||||
|
||||
These correspond to the `new` and `existing` parameters for
|
||||
`ETCD_INITIAL_CLUSTER_STATE` in the upstream documentation. Once an etcd node
|
||||
has completed bootstrap, the bootstrap configuration is ignored, even if it is
|
||||
changed.
|
||||
|
||||
Kolla Ansible will decide to perform a new cluster bootstrap if it detects that
|
||||
there is no existing data on the etcd nodes. Otherwise it assumes that there is
|
||||
a healthy etcd cluster and it will add a new node to it.
|
||||
|
||||
Forcing Bootstrapping
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Kolla Ansible looks for the `kolla_etcd` volume on the node. If this volume
|
||||
is available, it assumes that the bootstrap process has run on the node and
|
||||
the volume contains the required config.
|
||||
|
||||
However, if the process was interrupted (externally, or by an error), this
|
||||
volume might be misconfigured. In order to prevent data loss, manual
|
||||
intervention is required.
|
||||
|
||||
Before retriggering bootstrap make sure that there is no valuable data on the
|
||||
volume. This could be because the node was not in service, or because the data
|
||||
is persisted elsewhere.
|
||||
|
||||
To retrigger a bootstrap (for either the cluster, or for a single node),
|
||||
remove the volume from all affected nodes:
|
||||
|
||||
``docker volume rm kolla_etcd``
|
||||
|
||||
Rerunning Kolla Ansible will then trigger the appropriate workflow and either
|
||||
a blank cluster will be bootstrapped, or an empty member will be added to
|
||||
the existing cluster.
|
||||
|
||||
Manual Commands
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
In order to manage etcd manually, the ``etcdctl`` command can be used inside
|
||||
the `etcd` container. This command has been set up with the appropriate
|
||||
environment variables for integrating with automation.
|
||||
|
||||
``etcdctl`` is configured with json output by default:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# list cluster members in a human-readable table
|
||||
docker exec -it etcd etcdctl -w table member list
|
||||
|
||||
Removing Dead Nodes
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If ``globals.yml`` has the value ``etcd_remove_deleted_members: "yes"`` then
|
||||
etcd nodes that are not in the inventory will be removed from the etcd cluster.
|
||||
|
||||
Any errors in the inventory can therefore cause unintended removal.
|
||||
|
||||
To manually remove a dead node from the etcd cluster, use the following
|
||||
commands:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# list cluster members and identify dead member
|
||||
docker exec -it etcd etcdctl -w table member list
|
||||
# remove dead member
|
||||
docker exec -it etcd etcdctl member remove MEMBER_ID_IN_HEX
|
@ -9,5 +9,6 @@ Admin Guides
|
||||
tls
|
||||
acme
|
||||
mariadb-backup-and-restore
|
||||
etcd
|
||||
production-architecture-guide
|
||||
deployment-philosophy
|
||||
|
@ -173,6 +173,14 @@ For each host, clean up its services:
|
||||
|
||||
.. _removing-existing-compute-nodes:
|
||||
|
||||
If the node is also running the `etcd` service, set
|
||||
``etcd_remove_deleted_members: "yes"`` in `globals.yml` to automatically
|
||||
remove nodes from the `etcd` cluster that have been removed from the inventory.
|
||||
|
||||
Alternatively the `etcd` members can be removed manually with `etcdctl`. For
|
||||
more details, please consult the `runtime reconfiguration` documentation
|
||||
section for the version of etcd in operation.
|
||||
|
||||
Removing existing compute nodes
|
||||
-------------------------------
|
||||
|
||||
|
@ -903,3 +903,10 @@ workaround_ansible_issue_8743: yes
|
||||
|
||||
# this is UDP port
|
||||
#hacluster_corosync_port: 5405
|
||||
|
||||
##############
|
||||
# etcd options
|
||||
##############
|
||||
# If `etcd_remove_deleted_members` is enabled, Kolla Ansible will automatically
|
||||
# remove etcd members from the cluster that are no longer in the inventory.
|
||||
#etcd_remove_deleted_members: "no"
|
||||
|
12
releasenotes/notes/managed-etcd-72fb2d3fbba516d9.yaml
Normal file
12
releasenotes/notes/managed-etcd-72fb2d3fbba516d9.yaml
Normal file
@ -0,0 +1,12 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
The `etcd` tooling has been updated to better serialize restarts when
|
||||
applying configuration or updates. Previously minor outages might occur
|
||||
since all services were restarted in the same task.
|
||||
- |
|
||||
The `etcd` tooling has been updated to handle adding and removing nodes.
|
||||
Previously this was an undocumented manual process and required creating
|
||||
service containers. Operators can refer to the
|
||||
`etcd admin guide <https://docs.openstack.org/kolla-ansible/latest/admin/etcd.html>`__
|
||||
for more details.
|
@ -52,7 +52,7 @@ function prepare_images {
|
||||
fi
|
||||
|
||||
if [[ $SCENARIO == "cephadm" ]]; then
|
||||
GATE_IMAGES+=",^cinder"
|
||||
GATE_IMAGES+=",^cinder,^etcd"
|
||||
fi
|
||||
|
||||
if [[ $SCENARIO == "cells" ]]; then
|
||||
|
@ -77,6 +77,7 @@ openstack_tag_suffix: "{{ docker_image_tag_suffix }}"
|
||||
enable_zun: "yes"
|
||||
enable_kuryr: "yes"
|
||||
enable_etcd: "yes"
|
||||
etcd_remove_deleted_members: "yes"
|
||||
docker_configure_for_zun: "yes"
|
||||
containerd_configure_for_zun: "yes"
|
||||
enable_cinder: "yes"
|
||||
@ -132,6 +133,9 @@ enable_cinder: "yes"
|
||||
glance_backend_ceph: "yes"
|
||||
cinder_backend_ceph: "yes"
|
||||
nova_backend_ceph: "yes"
|
||||
# Internal etcd
|
||||
enable_etcd: "yes"
|
||||
etcd_remove_deleted_members: "yes"
|
||||
|
||||
enable_ceph_rgw: "yes"
|
||||
ceph_rgw_hosts:
|
||||
|
Loading…
Reference in New Issue
Block a user