ovn: Improve clustering

Currently the clustering steps are very static: if, for any reason, the
first node in the inventory fails and gets re-introduced, Kolla Ansible
will create a second, empty cluster on that node.

This patch changes the approach: it checks whether a cluster already
exists and, if it does, chooses a donor for the new node from the set of
currently running nodes.

It also fixes node replacement: the old node that has the same IP address
as the newly provisioned node is removed from the cluster.

Closes-Bug: #1875223

Change-Id: Ia025283e38ea7c3bd37c7a70d03f6b46c68f4456
This commit is contained in:
Michal Nasiadka 2022-12-30 15:19:27 +00:00
parent 375ecdde07
commit 7cc4bf6203
8 changed files with 343 additions and 2 deletions

View File

@ -0,0 +1,91 @@
# Copyright 2023 StackHPC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import docker
from ansible.module_utils.basic import AnsibleModule
# ansible-doc metadata for this module. The option types listed here mirror
# the argument_spec built in main().
DOCUMENTATION = '''
---
module: kolla_container_volume_facts
short_description: Module for collecting Docker container volume facts
description:
  - A module targeted at collecting Docker container volume facts. It is used
    for detecting whether the container volume exists on a host.
options:
  container_engine:
    description:
      - Name of container engine to use
    required: True
    type: str
  api_version:
    description:
      - The version of the api for docker-py to use when contacting docker
    required: False
    type: str
    default: auto
  name:
    description:
      - Name or names of the container volumes
    required: False
    type: list
    default: []
author: Jeffrey Zhang / Michal Nasiadka
'''
# Usage examples shown by ansible-doc. container_engine is passed in every
# example because the module's argument_spec marks it as required.
EXAMPLES = '''
- hosts: all
  tasks:
    - name: Gather docker facts
      kolla_container_volume_facts:
        container_engine: docker

    - name: Gather glance container facts
      kolla_container_volume_facts:
        container_engine: docker
        name:
          - glance_api
          - glance_registry
'''
def get_docker_client():
    """Return the docker-py low-level client class.

    The class itself is returned, not an instance; the caller is expected
    to instantiate it (main() does so with a ``version`` keyword).
    """
    client_class = docker.APIClient
    return client_class
def main():
    """Collect Docker volume facts and hand them back via ``exit_json``.

    Module parameters:
        name: optional list of volume names; when non-empty only volumes
            whose name is in the list are reported.
        api_version: passed to the Docker client as ``version``.
        container_engine: required by the argument spec.
            NOTE(review): the value is not consulted in this function; the
            Docker client is always used.

    The result carries ``changed=False``, a ``_volumes`` list with every
    reported volume, and one top-level key per reported volume name.
    """
    argument_spec = dict(
        name=dict(required=False, type='list', default=[]),
        api_version=dict(required=False, type='str', default='auto'),
        container_engine=dict(required=True, type='str')
    )

    module = AnsibleModule(argument_spec=argument_spec)

    results = dict(changed=False, _volumes=[])
    client = get_docker_client()(version=module.params.get('api_version'))

    names = module.params.get('name')
    if names and not isinstance(names, list):
        names = [names]

    # The "Volumes" entry of the listing can be null (or missing) when the
    # host has no volumes at all; treat that as an empty list instead of
    # failing with a TypeError while iterating.
    volumes = client.volumes().get('Volumes') or []

    for volume in volumes:
        volume_name = volume['Name']
        if names and volume_name not in names:
            continue
        results['_volumes'].append(volume)
        results[volume_name] = volume

    module.exit_json(**results)


if __name__ == "__main__":
    main()

View File

@ -75,3 +75,24 @@ ovn_openflow_probe_interval: "60"
ovn_db_inactivity_probe: "60000" ovn_db_inactivity_probe: "60000"
ovn_sb_db_inactivity_probe: "{{ ovn_db_inactivity_probe }}" ovn_sb_db_inactivity_probe: "{{ ovn_db_inactivity_probe }}"
ovn_nb_db_inactivity_probe: "{{ ovn_db_inactivity_probe }}" ovn_nb_db_inactivity_probe: "{{ ovn_db_inactivity_probe }}"
# OVN startup commands
#
# Command lines for the OVN NB/SB ovsdb-server containers.
# ovn_{nb,sb}_db_bootstrap_args is an optional extra argument string.
# default('', true) is used so that an undefined variable and a variable
# that has been reset to null both render as an empty string.
ovn_nb_command: >-
  /usr/share/ovn/scripts/ovn-ctl run_nb_ovsdb
  --db-nb-addr={{ api_interface_address | put_address_in_context('url') }}
  --db-nb-cluster-local-addr={{ api_interface_address | put_address_in_context('url') }}
  {{ ovn_nb_db_bootstrap_args | default('', true) }}
  --db-nb-sock=/run/ovn/ovnnb_db.sock
  --db-nb-pidfile=/run/ovn/ovnnb_db.pid
  --db-nb-file=/var/lib/openvswitch/ovn-nb/ovnnb.db
  --ovn-nb-logfile=/var/log/kolla/openvswitch/ovn-nb-db.log

ovn_sb_command: >-
  /usr/share/ovn/scripts/ovn-ctl run_sb_ovsdb
  --db-sb-addr={{ api_interface_address | put_address_in_context('url') }}
  --db-sb-cluster-local-addr={{ api_interface_address | put_address_in_context('url') }}
  {{ ovn_sb_db_bootstrap_args | default('', true) }}
  --db-sb-sock=/run/ovn/ovnsb_db.sock
  --db-sb-pidfile=/run/ovn/ovnsb_db.pid
  --db-sb-file=/var/lib/openvswitch/ovn-sb/ovnsb.db
  --ovn-sb-logfile=/var/log/kolla/openvswitch/ovn-sb-db.log

# Directory used to build the ovs-appctl control socket path
# (<ovn_db_path>/ovn{nb,sb}_db.ctl); it differs on debian/ubuntu.
ovn_db_path: "{{ '/var/run/openvswitch' if kolla_base_distro in ['debian', 'ubuntu'] else '/var/run/ovn' }}"

View File

@ -0,0 +1,83 @@
---
- name: Bootstrap new cluster
  block:
    # No leader/follower groups exist: every host except the first one in
    # the inventory group is pointed at that first host.
    - name: Set bootstrap args fact for NB (new cluster)
      set_fact:
        ovn_nb_db_bootstrap_args: "{% if groups['ovn-nb-db'] | length > 1 and inventory_hostname != groups['ovn-nb-db'][0] %} --db-nb-cluster-remote-addr={{ 'api' | kolla_address(groups['ovn-nb-db'][0]) | put_address_in_context('url') }} {% endif %}"
      when: groups['ovn-nb-db_leader'] is not defined and groups['ovn-nb-db_follower'] is not defined

    - name: Set bootstrap args fact for SB (new cluster)
      set_fact:
        ovn_sb_db_bootstrap_args: "{% if groups['ovn-sb-db'] | length > 1 and inventory_hostname != groups['ovn-sb-db'][0] %} --db-sb-cluster-remote-addr={{ 'api' | kolla_address(groups['ovn-sb-db'][0]) | put_address_in_context('url') }} {% endif %}"
      when: groups['ovn-sb-db_leader'] is not defined and groups['ovn-sb-db_follower'] is not defined

    # A leader exists: ask it for the cluster status on behalf of each
    # host that has no DB volume yet.
    - name: Check NB cluster status
      command: >
        {{ kolla_container_engine }} exec ovn_nb_db ovs-appctl -t {{ ovn_db_path }}/ovnnb_db.ctl
        cluster/status OVN_Northbound
      become: true
      changed_when: false
      register: ovn_nb_db_cluster_status
      when: groups['ovn-nb-db_leader'] is defined and inventory_hostname in groups.get('ovn-nb-db_had_volume_False', '')
      delegate_to: "{{ groups['ovn-nb-db_leader'][0] }}"

    - name: Check SB cluster status
      command: >
        {{ kolla_container_engine }} exec ovn_sb_db ovs-appctl -t {{ ovn_db_path }}/ovnsb_db.ctl
        cluster/status OVN_Southbound
      become: true
      changed_when: false
      register: ovn_sb_db_cluster_status
      when: groups['ovn-sb-db_leader'] is defined and inventory_hostname in groups.get('ovn-sb-db_had_volume_False', '')
      delegate_to: "{{ groups['ovn-sb-db_leader'][0] }}"

    # The same anchored pattern (escaped address, port and closing
    # parenthesis) drives both the guard and the extraction, so that e.g.
    # 10.0.0.1 cannot match a member listed at 10.0.0.10 and the server id
    # lookup can never come back empty once the guard has passed.
    # NOTE(review): api_interface_address is used verbatim; confirm how an
    # IPv6 address is printed in the cluster/status output.
    - name: Remove an old node with the same ip address as the new node in NB DB
      vars:
        ovn_nb_old_node_pattern: '\((\w{4}) at tcp:{{ api_interface_address | regex_escape }}:6643\)'
        ovn_nb_old_node: "{{ ovn_nb_db_cluster_status.stdout | regex_search(ovn_nb_old_node_pattern, '\\1') | first }}"
      become: true
      command: >
        {{ kolla_container_engine }} exec ovn_nb_db ovs-appctl -t {{ ovn_db_path }}/ovnnb_db.ctl
        cluster/kick OVN_Northbound {{ ovn_nb_old_node }}
      when:
        - ovn_nb_db_cluster_status.stdout is defined
        - ovn_nb_db_cluster_status.stdout is search(ovn_nb_old_node_pattern)
        - inventory_hostname in groups.get('ovn-nb-db_had_volume_False', '')
      delegate_to: "{{ groups['ovn-nb-db_leader'][0] }}"

    - name: Remove an old node with the same ip address as the new node in SB DB
      vars:
        ovn_sb_old_node_pattern: '\((\w{4}) at tcp:{{ api_interface_address | regex_escape }}:6644\)'
        ovn_sb_old_node: "{{ ovn_sb_db_cluster_status.stdout | regex_search(ovn_sb_old_node_pattern, '\\1') | first }}"
      become: true
      command: >
        {{ kolla_container_engine }} exec ovn_sb_db ovs-appctl -t {{ ovn_db_path }}/ovnsb_db.ctl
        cluster/kick OVN_Southbound {{ ovn_sb_old_node }}
      when:
        - ovn_sb_db_cluster_status.stdout is defined
        - ovn_sb_db_cluster_status.stdout is search(ovn_sb_old_node_pattern)
        - inventory_hostname in groups.get('ovn-sb-db_had_volume_False', '')
      delegate_to: "{{ groups['ovn-sb-db_leader'][0] }}"

    # Hosts without a volume join the existing cluster through its leader.
    - name: Set bootstrap args fact for NB (new member)
      set_fact:
        ovn_nb_db_bootstrap_args: "--db-nb-cluster-remote-addr={{ 'api' | kolla_address(groups.get('ovn-nb-db_leader', groups['ovn-nb-db'])[0] | default()) | put_address_in_context('url') }}"
      when: inventory_hostname in groups.get('ovn-nb-db_had_volume_False', '') and groups['ovn-nb-db_leader'] is defined

    - name: Set bootstrap args fact for SB (new member)
      set_fact:
        ovn_sb_db_bootstrap_args: "--db-sb-cluster-remote-addr={{ 'api' | kolla_address(groups.get('ovn-sb-db_leader', groups['ovn-sb-db'])[0] | default()) | put_address_in_context('url') }}"
      when: inventory_hostname in groups.get('ovn-sb-db_had_volume_False', '') and groups['ovn-sb-db_leader'] is defined

    - import_tasks: config.yml

    - import_tasks: check-containers.yml

    - name: Flush handlers
      meta: flush_handlers

    - import_tasks: bootstrap-db.yml

    # Reset both facts to null once bootstrapping is done.
    - name: Unset bootstrap args fact
      set_fact:
        ovn_nb_db_bootstrap_args: null
        ovn_sb_db_bootstrap_args: null
  any_errors_fatal: true

View File

@ -1,4 +1,14 @@
--- ---
# Inspect the existing cluster state on every OVN NB/SB DB host.
- include_tasks: lookup_cluster.yml
  when: >-
    inventory_hostname in groups['ovn-nb-db'] or
    inventory_hostname in groups['ovn-sb-db']

# Bootstrap only the hosts grouped as having no NB or SB DB volume.
- include_tasks: bootstrap-initial.yml
  when: >-
    inventory_hostname in groups.get('ovn-nb-db_had_volume_False', '') or
    inventory_hostname in groups.get('ovn-sb-db_had_volume_False', '')
- import_tasks: config.yml - import_tasks: config.yml
- import_tasks: check-containers.yml - import_tasks: check-containers.yml

View File

@ -0,0 +1,131 @@
---
# Look up the ovn_nb_db / ovn_sb_db container volumes on this host.
- name: Checking for any existing OVN DB container volumes
  become: true
  kolla_container_volume_facts:
    container_engine: "{{ kolla_container_engine }}"
    name:
      - ovn_nb_db
      - ovn_sb_db
  register: ovn_db_container_volume_facts

# Hosts land in ovn-{nb,sb}-db_had_volume_True or ..._had_volume_False
# depending on whether the matching key is present in the facts.
- name: Divide hosts by their OVN NB volume availability
  group_by:
    key: "ovn-nb-db_had_volume_{{ ovn_db_container_volume_facts['ovn_nb_db'] is defined }}"
  changed_when: false

- name: Divide hosts by their OVN SB volume availability
  group_by:
    key: "ovn-sb-db_had_volume_{{ ovn_db_container_volume_facts['ovn_sb_db'] is defined }}"
  changed_when: false

# A cluster counts as existing as soon as a had_volume_True group exists.
- name: Establish whether the OVN NB cluster has already existed
  set_fact:
    ovn_nb_db_cluster_exists: "{{ groups['ovn-nb-db_had_volume_True'] is defined }}"

- name: Establish whether the OVN SB cluster has already existed
  set_fact:
    ovn_sb_db_cluster_exists: "{{ groups['ovn-sb-db_had_volume_True'] is defined }}"
# Runs only on hosts that already have an NB DB volume (see the block's
# `when`); classifies them by port liveness and leader/follower role.
- name: OVN NB checks
  block:
    # The host list in the message is the whole ovn-nb-db group, i.e. the
    # hosts that must all be targeted, not the hosts needing bootstrap.
    # NOTE(review): the condition does not check whether any host actually
    # lacks a volume - confirm that failing on every partial run is intended.
    - name: Check if running on all OVN NB DB hosts
      fail:
        msg: >
          Some hosts need database bootstrapping, but not all OVN NB DB hosts
          ({{ groups['ovn-nb-db'] | join(', ') }}) are in the target
          list. Stopping as it may be unsafe to proceed. Please run without --limit
          or --serial to bootstrap these hosts.
      when:
        - ovn_nb_db_cluster_exists
        - groups['ovn-nb-db'] | difference(ansible_play_batch) | list | length > 0

    # Errors are ignored on purpose: the registered result is used below
    # to tell live hosts from dead ones.
    - name: Check OVN NB service port liveness
      wait_for:
        host: "{{ api_interface_address }}"
        port: "{{ ovn_nb_db_port }}"
        connect_timeout: 1
        timeout: 10
      register: check_ovn_nb_db_port_liveness
      ignore_errors: true

    - name: Divide hosts by their OVN NB service port liveness
      group_by:
        key: "ovn-nb-db_port_alive_{{ check_ovn_nb_db_port_liveness is success }}"
      changed_when: false

    - name: Get OVN NB database information
      command: >
        {{ kolla_container_engine }} exec ovn_nb_db ovsdb-client query unix:/run/ovn/ovnnb_db.sock
        "[\"_Server\",{\"table\":\"Database\",\"where\":[[\"name\",\"==\", \"OVN_Northbound\"]],\"op\":\"select\"}]"
      become: true
      when: check_ovn_nb_db_port_liveness is success
      changed_when: false
      register: ovn_nb_db_info

    # The `leader` column of the first returned row decides the group.
    - name: Divide hosts by their OVN NB leader/follower role
      group_by:
        key: "ovn-nb-db_{{ 'leader' if (ovn_nb_db_info.stdout | from_json).0.rows.0.leader else 'follower' }}"
      when: check_ovn_nb_db_port_liveness is success
      changed_when: false

    - name: Fail on existing OVN NB cluster with no leader
      fail:
        msg: OVN NB cluster exists but there is no leader - please check cluster status
      when:
        - groups['ovn-nb-db_leader'] is not defined and groups['ovn-nb-db_follower'] is defined
  any_errors_fatal: true
  when: inventory_hostname in groups.get('ovn-nb-db_had_volume_True', '')
# Runs only on hosts that already have an SB DB volume (see the block's
# `when`); classifies them by port liveness and leader/follower role.
- name: OVN SB checks
  block:
    # The host list in the message is the whole ovn-sb-db group, i.e. the
    # hosts that must all be targeted, not the hosts needing bootstrap.
    # NOTE(review): the condition does not check whether any host actually
    # lacks a volume - confirm that failing on every partial run is intended.
    - name: Check if running on all OVN SB DB hosts
      fail:
        msg: >
          Some hosts need database bootstrapping, but not all OVN SB DB hosts
          ({{ groups['ovn-sb-db'] | join(', ') }}) are in the target
          list. Stopping as it may be unsafe to proceed. Please run without --limit
          or --serial to bootstrap these hosts.
      when:
        - ovn_sb_db_cluster_exists
        - groups['ovn-sb-db'] | difference(ansible_play_batch) | list | length > 0

    # Errors are ignored on purpose: the registered result is used below
    # to tell live hosts from dead ones.
    - name: Check OVN SB service port liveness
      wait_for:
        host: "{{ api_interface_address }}"
        port: "{{ ovn_sb_db_port }}"
        connect_timeout: 1
        timeout: 10
      register: check_ovn_sb_db_port_liveness
      ignore_errors: true

    - name: Divide hosts by their OVN SB service port liveness
      group_by:
        key: "ovn-sb-db_port_alive_{{ check_ovn_sb_db_port_liveness is success }}"
      changed_when: false

    - name: Get OVN SB database information
      command: >
        {{ kolla_container_engine }} exec ovn_sb_db ovsdb-client query unix:/run/ovn/ovnsb_db.sock
        "[\"_Server\",{\"table\":\"Database\",\"where\":[[\"name\",\"==\", \"OVN_Southbound\"]],\"op\":\"select\"}]"
      become: true
      when: check_ovn_sb_db_port_liveness is success
      changed_when: false
      register: ovn_sb_db_info

    # The `leader` column of the first returned row decides the group.
    - name: Divide hosts by their OVN SB leader/follower role
      group_by:
        key: "ovn-sb-db_{{ 'leader' if (ovn_sb_db_info.stdout | from_json).0.rows.0.leader else 'follower' }}"
      when: check_ovn_sb_db_port_liveness is success
      changed_when: false

    - name: Fail on existing OVN SB cluster with no leader
      fail:
        msg: OVN SB cluster exists but there is no leader - please check cluster status.
      when:
        - groups['ovn-sb-db_leader'] is not defined and groups['ovn-sb-db_follower'] is defined
  any_errors_fatal: true
  when: inventory_hostname in groups.get('ovn-sb-db_had_volume_True', '')

View File

@ -1,5 +1,5 @@
{ {
"command": "/usr/share/ovn/scripts/ovn-ctl run_nb_ovsdb --db-nb-addr={{ api_interface_address | put_address_in_context('url') }} --db-nb-cluster-local-addr={{ api_interface_address | put_address_in_context('url') }} {% if groups['ovn-nb-db'] | length > 1 and inventory_hostname != groups['ovn-nb-db'][0] %} --db-nb-cluster-remote-addr={{ 'api' | kolla_address(groups['ovn-nb-db'][0]) | put_address_in_context('url') }} {% endif %} --db-nb-sock=/run/ovn/ovnnb_db.sock --db-nb-pidfile=/run/ovn/ovnnb_db.pid --db-nb-file=/var/lib/openvswitch/ovn-nb/ovnnb.db --ovn-nb-logfile=/var/log/kolla/openvswitch/ovn-nb-db.log", "command": "{{ ovn_nb_command }}",
"permissions": [ "permissions": [
{ {
"path": "/var/log/kolla/openvswitch", "path": "/var/log/kolla/openvswitch",

View File

@ -1,5 +1,5 @@
{ {
"command": "/usr/share/ovn/scripts/ovn-ctl run_sb_ovsdb --db-sb-addr={{ api_interface_address | put_address_in_context('url') }} --db-sb-cluster-local-addr={{ api_interface_address | put_address_in_context('url') }} {% if groups['ovn-sb-db'] | length > 1 and inventory_hostname != groups['ovn-sb-db'][0] %} --db-sb-cluster-remote-addr={{ 'api' | kolla_address(groups['ovn-sb-db'][0]) | put_address_in_context('url') }} {% endif %} --db-sb-sock=/run/ovn/ovnsb_db.sock --db-sb-pidfile=/run/ovn/ovnsb_db.pid --db-sb-file=/var/lib/openvswitch/ovn-sb/ovnsb.db --ovn-sb-logfile=/var/log/kolla/openvswitch/ovn-sb-db.log", "command": "{{ ovn_sb_command }}",
"permissions": [ "permissions": [
{ {
"path": "/var/log/kolla/openvswitch", "path": "/var/log/kolla/openvswitch",

View File

@ -0,0 +1,5 @@
---
fixes:
- |
Fixes issues with OVN NB/SB DB deployment when the first node needs to be
re-bootstrapped. `LP#1875223 <https://launchpad.net/bugs/1875223>`__