From 7cc4bf62031d293487b1714448ba7bb65ed324d6 Mon Sep 17 00:00:00 2001 From: Michal Nasiadka Date: Fri, 30 Dec 2022 15:19:27 +0000 Subject: [PATCH] ovn: Improve clustering Currently clustering steps are very static, if for a reason first node in the inventory fails and gets re-introduced - K-A will create a second empty cluster on that node. This patch changes the approach and checks if cluster exists, if it does - chooses a donor for the new node from currently running node set. Also it fixes node replacement - it removes old node from cluster (that has the same ip address as newly provisioned node). Closes-Bug: #1875223 Change-Id: Ia025283e38ea7c3bd37c7a70d03f6b46c68f4456 --- .../library/kolla_container_volume_facts.py | 91 ++++++++++++ ansible/roles/ovn-db/defaults/main.yml | 21 +++ .../roles/ovn-db/tasks/bootstrap-initial.yml | 83 +++++++++++ ansible/roles/ovn-db/tasks/deploy.yml | 10 ++ ansible/roles/ovn-db/tasks/lookup_cluster.yml | 131 ++++++++++++++++++ .../roles/ovn-db/templates/ovn-nb-db.json.j2 | 2 +- .../roles/ovn-db/templates/ovn-sb-db.json.j2 | 2 +- .../notes/1875223-05552108375d005a.yaml | 5 + 8 files changed, 343 insertions(+), 2 deletions(-) create mode 100644 ansible/library/kolla_container_volume_facts.py create mode 100644 ansible/roles/ovn-db/tasks/bootstrap-initial.yml create mode 100644 ansible/roles/ovn-db/tasks/lookup_cluster.yml create mode 100644 releasenotes/notes/1875223-05552108375d005a.yaml diff --git a/ansible/library/kolla_container_volume_facts.py b/ansible/library/kolla_container_volume_facts.py new file mode 100644 index 0000000000..b1f471d229 --- /dev/null +++ b/ansible/library/kolla_container_volume_facts.py @@ -0,0 +1,91 @@ +# Copyright 2023 StackHPC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import docker + +from ansible.module_utils.basic import AnsibleModule + +DOCUMENTATION = ''' +--- +module: kolla_container_volume_facts +short_description: Module for collecting Docker container volume facts +description: + - A module targeted at collecting Docker container volume facts. It is used + for detecting whether the container volume exists on a host. +options: + container_engine: + description: + - Name of container engine to use + required: True + type: str + api_version: + description: + - The version of the api for docker-py to use when contacting docker + required: False + type: str + default: auto + name: + description: + - Name or names of the container volumes + required: False + type: str or list +author: Jeffrey Zhang / Michal Nasiadka +''' + +EXAMPLES = ''' +- hosts: all + tasks: + - name: Gather docker facts + kolla_container_volume_facts: + + - name: Gather glance container facts + kolla_container_volume_facts: + container_engine: docker + name: + - glance_api + - glance_registry +''' + + +def get_docker_client(): + return docker.APIClient + + +def main(): + argument_spec = dict( + name=dict(required=False, type='list', default=[]), + api_version=dict(required=False, type='str', default='auto'), + container_engine=dict(required=True, type='str') + ) + + module = AnsibleModule(argument_spec=argument_spec) + + results = dict(changed=False, _volumes=[]) + client = get_docker_client()(version=module.params.get('api_version')) + volumes = client.volumes() + names = module.params.get('name') + if names and not isinstance(names, list): + 
names = [names] + for volume in volumes['Volumes']: + volume_name = volume['Name'] + if names and volume_name not in names: + continue + results['_volumes'].append(volume) + results[volume_name] = volume + module.exit_json(**results) + + +if __name__ == "__main__": + main() diff --git a/ansible/roles/ovn-db/defaults/main.yml b/ansible/roles/ovn-db/defaults/main.yml index ab97210185..57735c37cc 100644 --- a/ansible/roles/ovn-db/defaults/main.yml +++ b/ansible/roles/ovn-db/defaults/main.yml @@ -75,3 +75,24 @@ ovn_openflow_probe_interval: "60" ovn_db_inactivity_probe: "60000" ovn_sb_db_inactivity_probe: "{{ ovn_db_inactivity_probe }}" ovn_nb_db_inactivity_probe: "{{ ovn_db_inactivity_probe }}" +# OVN startup commands +ovn_nb_command: >- + /usr/share/ovn/scripts/ovn-ctl run_nb_ovsdb + --db-nb-addr={{ api_interface_address | put_address_in_context('url') }} + --db-nb-cluster-local-addr={{ api_interface_address | put_address_in_context('url') }} + {{ ovn_nb_db_bootstrap_args | default('') }} + --db-nb-sock=/run/ovn/ovnnb_db.sock + --db-nb-pidfile=/run/ovn/ovnnb_db.pid + --db-nb-file=/var/lib/openvswitch/ovn-nb/ovnnb.db + --ovn-nb-logfile=/var/log/kolla/openvswitch/ovn-nb-db.log +ovn_sb_command: >- + /usr/share/ovn/scripts/ovn-ctl run_sb_ovsdb + --db-sb-addr={{ api_interface_address | put_address_in_context('url') }} + --db-sb-cluster-local-addr={{ api_interface_address | put_address_in_context('url') }} + {{ ovn_sb_db_bootstrap_args | default('') }} + --db-sb-sock=/run/ovn/ovnsb_db.sock + --db-sb-pidfile=/run/ovn/ovnsb_db.pid + --db-sb-file=/var/lib/openvswitch/ovn-sb/ovnsb.db + --ovn-sb-logfile=/var/log/kolla/openvswitch/ovn-sb-db.log + +ovn_db_path: "{{ '/var/run/openvswitch' if kolla_base_distro in ['debian', 'ubuntu'] else '/var/run/ovn' }}" diff --git a/ansible/roles/ovn-db/tasks/bootstrap-initial.yml b/ansible/roles/ovn-db/tasks/bootstrap-initial.yml new file mode 100644 index 0000000000..ba0d10c56e --- /dev/null +++ 
b/ansible/roles/ovn-db/tasks/bootstrap-initial.yml @@ -0,0 +1,83 @@ +--- +- name: Bootstrap new cluster + block: + + - name: Set bootstrap args fact for NB (new cluster) + set_fact: + ovn_nb_db_bootstrap_args: "{% if groups['ovn-nb-db'] | length > 1 and inventory_hostname != groups['ovn-nb-db'][0] %} --db-nb-cluster-remote-addr={{ 'api' | kolla_address(groups['ovn-nb-db'][0]) | put_address_in_context('url') }} {% endif %}" + when: groups['ovn-nb-db_leader'] is not defined and groups['ovn-nb-db_follower'] is not defined + + - name: Set bootstrap args fact for SB (new cluster) + set_fact: + ovn_sb_db_bootstrap_args: "{% if groups['ovn-sb-db'] | length > 1 and inventory_hostname != groups['ovn-sb-db'][0] %} --db-sb-cluster-remote-addr={{ 'api' | kolla_address(groups['ovn-sb-db'][0]) | put_address_in_context('url') }} {% endif %}" + when: groups['ovn-sb-db_leader'] is not defined and groups['ovn-sb-db_follower'] is not defined + + - name: Check NB cluster status + command: > + {{ kolla_container_engine }} exec ovn_nb_db ovs-appctl -t {{ ovn_db_path }}/ovnnb_db.ctl + cluster/status OVN_Northbound + become: true + changed_when: false + register: ovn_nb_db_cluster_status + when: groups['ovn-nb-db_leader'] is defined and inventory_hostname in groups.get('ovn-nb-db_had_volume_False', '') + delegate_to: "{{ groups['ovn-nb-db_leader'][0] }}" + + - name: Check SB cluster status + command: > + {{ kolla_container_engine }} exec ovn_sb_db ovs-appctl -t {{ ovn_db_path }}/ovnsb_db.ctl + cluster/status OVN_Southbound + become: true + changed_when: false + register: ovn_sb_db_cluster_status + when: groups['ovn-sb-db_leader'] is defined and inventory_hostname in groups.get('ovn-sb-db_had_volume_False', '') + delegate_to: "{{ groups['ovn-sb-db_leader'][0] }}" + + - name: Remove an old node with the same ip address as the new node in NB DB + vars: + ovn_nb_old_node: "{{ ovn_nb_db_cluster_status | regex_search('\\((\\w{4}) at tcp:' + api_interface_address + ':6643\\)', '\\1') | first }}" 
+ become: true + command: > + {{ kolla_container_engine }} exec ovn_nb_db ovs-appctl -t {{ ovn_db_path }}/ovnnb_db.ctl + cluster/kick OVN_Northbound {{ ovn_nb_old_node }} + when: + - ovn_nb_db_cluster_status.stdout is defined + - (ovn_nb_db_cluster_status.stdout is search('at tcp:' + api_interface_address)) and inventory_hostname in groups.get('ovn-nb-db_had_volume_False', '') + delegate_to: "{{ groups['ovn-nb-db_leader'][0] }}" + + - name: Remove an old node with the same ip address as the new node in SB DB + vars: + ovn_sb_old_node: "{{ ovn_sb_db_cluster_status | regex_search('\\((\\w{4}) at tcp:' + api_interface_address + ':6644\\)', '\\1') | first }}" + become: true + command: > + {{ kolla_container_engine }} exec ovn_sb_db ovs-appctl -t {{ ovn_db_path }}/ovnsb_db.ctl + cluster/kick OVN_Southbound {{ ovn_sb_old_node }} + when: + - ovn_sb_db_cluster_status.stdout is defined + - (ovn_sb_db_cluster_status.stdout is search('at tcp:' + api_interface_address)) and inventory_hostname in groups.get('ovn-sb-db_had_volume_False', '') + delegate_to: "{{ groups['ovn-sb-db_leader'][0] }}" + + - name: Set bootstrap args fact for NB (new member) + set_fact: + ovn_nb_db_bootstrap_args: "--db-nb-cluster-remote-addr={{ 'api' | kolla_address(groups.get('ovn-nb-db_leader', groups['ovn-nb-db'])[0] | default()) | put_address_in_context('url') }}" + when: inventory_hostname in groups.get('ovn-nb-db_had_volume_False', '') and groups['ovn-nb-db_leader'] is defined + + - name: Set bootstrap args fact for SB (new member) + set_fact: + ovn_sb_db_bootstrap_args: "--db-sb-cluster-remote-addr={{ 'api' | kolla_address(groups.get('ovn-sb-db_leader', groups['ovn-sb-db'])[0] | default()) | put_address_in_context('url') }}" + when: inventory_hostname in groups.get('ovn-sb-db_had_volume_False', '') and groups['ovn-sb-db_leader'] is defined + + - import_tasks: config.yml + + - import_tasks: check-containers.yml + + - name: Flush handlers + meta: flush_handlers + + - import_tasks: bootstrap-db.yml + 
+ - name: Unset bootstrap args fact + set_fact: + ovn_nb_db_bootstrap_args: + ovn_sb_db_bootstrap_args: + + any_errors_fatal: true diff --git a/ansible/roles/ovn-db/tasks/deploy.yml b/ansible/roles/ovn-db/tasks/deploy.yml index 1c68ca7eca..d92bb7b614 100644 --- a/ansible/roles/ovn-db/tasks/deploy.yml +++ b/ansible/roles/ovn-db/tasks/deploy.yml @@ -1,4 +1,14 @@ --- +- include_tasks: lookup_cluster.yml + when: + - inventory_hostname in groups['ovn-nb-db'] or + inventory_hostname in groups['ovn-sb-db'] + +- include_tasks: bootstrap-initial.yml + when: + - inventory_hostname in groups.get('ovn-nb-db_had_volume_False', '') or + inventory_hostname in groups.get('ovn-sb-db_had_volume_False', '') + - import_tasks: config.yml - import_tasks: check-containers.yml diff --git a/ansible/roles/ovn-db/tasks/lookup_cluster.yml b/ansible/roles/ovn-db/tasks/lookup_cluster.yml new file mode 100644 index 0000000000..15711427a7 --- /dev/null +++ b/ansible/roles/ovn-db/tasks/lookup_cluster.yml @@ -0,0 +1,131 @@ +--- +- name: Checking for any existing OVN DB container volumes + become: true + kolla_container_volume_facts: + container_engine: "{{ kolla_container_engine }}" + name: + - ovn_nb_db + - ovn_sb_db + register: ovn_db_container_volume_facts + +- name: Divide hosts by their OVN NB volume availability + group_by: + key: "ovn-nb-db_had_volume_{{ ovn_db_container_volume_facts['ovn_nb_db'] is defined }}" + changed_when: false + +- name: Divide hosts by their OVN SB volume availability + group_by: + key: "ovn-sb-db_had_volume_{{ ovn_db_container_volume_facts['ovn_sb_db'] is defined }}" + changed_when: false + +- name: Establish whether the OVN NB cluster has already existed + set_fact: + ovn_nb_db_cluster_exists: "{{ groups['ovn-nb-db' + '_had_volume_True'] is defined }}" + +- name: Establish whether the OVN SB cluster has already existed + set_fact: + ovn_sb_db_cluster_exists: "{{ groups['ovn-sb-db' + '_had_volume_True'] is defined }}" + +- name: OVN NB checks + block: + + - name: 
Check if running on all OVN NB DB hosts + fail: + msg: > + Some hosts ({{ groups['ovn-nb-db'] | join(', ') }}) need database + bootstrapping, but not all OVN NB DB hosts are in the target + list. Stopping as it may be unsafe to proceed. Please run without --limit + or --serial to bootstrap these hosts. + when: + - ovn_nb_db_cluster_exists + - groups['ovn-nb-db'] | difference(ansible_play_batch) | list | length > 0 + + - name: Check OVN NB service port liveness + wait_for: + host: "{{ api_interface_address }}" + port: "{{ ovn_nb_db_port }}" + connect_timeout: 1 + timeout: 10 + register: check_ovn_nb_db_port_liveness + ignore_errors: yes + + - name: Divide hosts by their OVN NB service port liveness + group_by: + key: "ovn-nb-db_port_alive_{{ check_ovn_nb_db_port_liveness is success }}" + changed_when: false + + - name: Get OVN NB database information + command: > + {{ kolla_container_engine }} exec ovn_nb_db ovsdb-client query unix:/run/ovn/ovnnb_db.sock + "[\"_Server\",{\"table\":\"Database\",\"where\":[[\"name\",\"==\", \"OVN_Northbound\"]],\"op\":\"select\"}]" + become: true + when: check_ovn_nb_db_port_liveness is success + changed_when: false + register: ovn_nb_db_info + + - name: Divide hosts by their OVN NB leader/follower role + group_by: + key: "ovn-nb-db_{{ 'leader' if (ovn_nb_db_info.stdout | from_json).0.rows.0.leader else 'follower' }}" + when: check_ovn_nb_db_port_liveness is success + changed_when: false + + - name: Fail on existing OVN NB cluster with no leader + fail: + msg: OVN NB cluster exists but there is no leader - please check cluster status + when: + - groups['ovn-nb-db_leader'] is not defined and groups['ovn-nb-db_follower'] is defined + + any_errors_fatal: true + when: inventory_hostname in groups.get('ovn-nb-db_had_volume_True', '') + +- name: OVN SB checks + block: + + - name: Check if running on all OVN SB DB hosts + fail: + msg: > + Some hosts ({{ groups['ovn-sb-db'] | join(', ') }}) need database + bootstrapping, but not all OVN SB DB 
hosts are in the target + list. Stopping as it may be unsafe to proceed. Please run without --limit + or --serial to bootstrap these hosts. + when: + - ovn_sb_db_cluster_exists + - groups['ovn-sb-db'] | difference(ansible_play_batch) | list | length > 0 + + - name: Check OVN SB service port liveness + wait_for: + host: "{{ api_interface_address }}" + port: "{{ ovn_sb_db_port }}" + connect_timeout: 1 + timeout: 10 + register: check_ovn_sb_db_port_liveness + ignore_errors: yes + + - name: Divide hosts by their OVN SB service port liveness + group_by: + key: "ovn-sb-db_port_alive_{{ check_ovn_sb_db_port_liveness is success }}" + changed_when: false + + - name: Get OVN SB database information + command: > + {{ kolla_container_engine }} exec ovn_sb_db ovsdb-client query unix:/run/ovn/ovnsb_db.sock + "[\"_Server\",{\"table\":\"Database\",\"where\":[[\"name\",\"==\", \"OVN_Southbound\"]],\"op\":\"select\"}]" + become: true + when: check_ovn_sb_db_port_liveness is success + changed_when: false + register: ovn_sb_db_info + + - name: Divide hosts by their OVN SB leader/follower role + group_by: + key: "ovn-sb-db_{{ 'leader' if (ovn_sb_db_info.stdout | from_json).0.rows.0.leader else 'follower' }}" + when: check_ovn_sb_db_port_liveness is success + changed_when: false + + - name: Fail on existing OVN SB cluster with no leader + fail: + msg: OVN SB cluster exists but there is no leader - please check cluster status. 
+ when: + - groups['ovn-sb-db_leader'] is not defined and groups['ovn-sb-db_follower'] is defined + + any_errors_fatal: true + when: inventory_hostname in groups.get('ovn-sb-db_had_volume_True', '') diff --git a/ansible/roles/ovn-db/templates/ovn-nb-db.json.j2 b/ansible/roles/ovn-db/templates/ovn-nb-db.json.j2 index e1a542a685..d3ff684b66 100644 --- a/ansible/roles/ovn-db/templates/ovn-nb-db.json.j2 +++ b/ansible/roles/ovn-db/templates/ovn-nb-db.json.j2 @@ -1,5 +1,5 @@ { - "command": "/usr/share/ovn/scripts/ovn-ctl run_nb_ovsdb --db-nb-addr={{ api_interface_address | put_address_in_context('url') }} --db-nb-cluster-local-addr={{ api_interface_address | put_address_in_context('url') }} {% if groups['ovn-nb-db'] | length > 1 and inventory_hostname != groups['ovn-nb-db'][0] %} --db-nb-cluster-remote-addr={{ 'api' | kolla_address(groups['ovn-nb-db'][0]) | put_address_in_context('url') }} {% endif %} --db-nb-sock=/run/ovn/ovnnb_db.sock --db-nb-pidfile=/run/ovn/ovnnb_db.pid --db-nb-file=/var/lib/openvswitch/ovn-nb/ovnnb.db --ovn-nb-logfile=/var/log/kolla/openvswitch/ovn-nb-db.log", + "command": "{{ ovn_nb_command }}", "permissions": [ { "path": "/var/log/kolla/openvswitch", diff --git a/ansible/roles/ovn-db/templates/ovn-sb-db.json.j2 b/ansible/roles/ovn-db/templates/ovn-sb-db.json.j2 index 6a0305171e..4139f58c3f 100644 --- a/ansible/roles/ovn-db/templates/ovn-sb-db.json.j2 +++ b/ansible/roles/ovn-db/templates/ovn-sb-db.json.j2 @@ -1,5 +1,5 @@ { - "command": "/usr/share/ovn/scripts/ovn-ctl run_sb_ovsdb --db-sb-addr={{ api_interface_address | put_address_in_context('url') }} --db-sb-cluster-local-addr={{ api_interface_address | put_address_in_context('url') }} {% if groups['ovn-sb-db'] | length > 1 and inventory_hostname != groups['ovn-sb-db'][0] %} --db-sb-cluster-remote-addr={{ 'api' | kolla_address(groups['ovn-sb-db'][0]) | put_address_in_context('url') }} {% endif %} --db-sb-sock=/run/ovn/ovnsb_db.sock --db-sb-pidfile=/run/ovn/ovnsb_db.pid 
--db-sb-file=/var/lib/openvswitch/ovn-sb/ovnsb.db --ovn-sb-logfile=/var/log/kolla/openvswitch/ovn-sb-db.log", + "command": "{{ ovn_sb_command }}", "permissions": [ { "path": "/var/log/kolla/openvswitch", diff --git a/releasenotes/notes/1875223-05552108375d005a.yaml b/releasenotes/notes/1875223-05552108375d005a.yaml new file mode 100644 index 0000000000..84f61b224f --- /dev/null +++ b/releasenotes/notes/1875223-05552108375d005a.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixes issues with OVN NB/SB DB deployment, where first node needs to be + rebootstrapped. `LP#1875223 <https://launchpad.net/bugs/1875223>`__