Improve bootstrap failure recovery in replay

Previously, bootstrap playbook roles were mainly triggered by config
changes during replay. Consequently, the playbook was unable to
recover from a previous failure caused by an issue other than a
misconfiguration in the host override, e.g. bad image/template,
backend code flaw, network glitch, proxy server down, user
interruption, etc.

Furthermore, depending on the step at which the last failure occurred,
a subsequent replay would fail on non-reentrant tasks such as
filesystem resizing, ip addr add/delete, and sysinv REST calls.

This commit addresses these flaws by maximizing the reentrancy
of bootstrap tasks and removing the restriction of roles inclusion
based on config changes.

Tests:
  - Bootstrap a simplex system locally
  - Bootstrap a standard system remotely
  - Install and reinstall ssl ca cert via bootstrap replay
  - Induce Kubernetes services bringup failures due to misconfiguration
    or a bad template file, change config and replay.
  - Induce initial database population failure due to misconfiguration,
    change config and replay.
  - Induce database update failure due to misconfiguration, change
    config and replay.
  - Induce random failures, make no config change and replay.

Known limitation:
  - Failure during the apply of bootstrap manifests may not be
    recoverable as most of these manifests are not re-entrant.

Closes-Bug: 1830781
Change-Id: Ia2c1e1199f2c67033fb91a7e9f24d808e6fe94c9
Signed-off-by: Tee Ngo <tee.ngo@windriver.com>
This commit is contained in:
Tee Ngo 2019-07-02 17:40:15 -04:00
parent c2f3cbe49d
commit a2f684c8fb
13 changed files with 603 additions and 394 deletions

View File

@ -1,3 +1,4 @@
---
timezone: Japan
cluster_host_subnet: 192.168.207.0/24
external_oam_floating_address: 172.16.0.17

View File

@ -1,3 +1,4 @@
---
system_mode: duplex
management_subnet: 192.168.204.0/24
external_oam_floating_address: 172.16.0.144

View File

@ -1,3 +1,4 @@
---
# The following are configuration parameters that should be stored
# in secrets files using Ansible vault.
#

View File

@ -1,3 +1,4 @@
---
# Configuration parameters in this file will apply to all
# hosts in the hosts file unless they are overwritten in the
# inventory_hostname.yml file or at the command line.

View File

@ -33,8 +33,8 @@
- { role: validate-config, when: not skip_play, become: yes }
- { role: store-passwd, when: not skip_play and save_password, become: yes }
- { role: apply-bootstrap-manifest, when: not skip_play and not replayed, become: yes }
- { role: persist-config, when: not skip_play and save_config, become: yes }
- { role: bringup-essential-services, when: not skip_play and save_config, become: yes }
- { role: persist-config, when: not skip_play, become: yes }
- { role: bringup-essential-services, when: not skip_play, become: yes }
vars:
change_password: false

View File

@ -13,6 +13,8 @@
- name: Add loopback interface
# Use shell instead of command module as source is an internal shell command
shell: "{{ item }}"
register: add_addresses
failed_when: false
with_items:
- source /etc/platform/openrc; system host-if-add controller-0 lo virtual none lo -c platform -m 1500
- source /etc/platform/openrc; system interface-network-assign controller-0 lo mgmt
@ -24,9 +26,15 @@
- ip addr add {{ mgmt_nfs_1_virtual }} dev lo scope host
- ip addr add {{ mgmt_nfs_2_virtual }} dev lo scope host
- name: Fail if adding interface addresses failed for reason other than it has been done before
fail:
msg: "{{ item.item }} failed for reason: {{ item.stderr }}."
when: item.rc != 0 and not incomplete_bootstrap
with_items: "{{ add_addresses.results }}"
- name: Remove previous management floating address if management network config has changed
command: ip addr delete {{ prev_mgmt_floating_virtual }} dev lo scope host
when: reconfigure_endpoints and
when: last_config_file_exists and reconfigure_endpoints and
(mgmt_floating_virtual != prev_mgmt_floating_virtual)
- name: Refresh local DNS (i.e. /etc/hosts)
@ -114,13 +122,12 @@
when: (not replayed) or (restart_services)
- block:
- name: Remove config file from previous play
- name: Remove config file from previous play
file:
path: "{{ last_bootstrap_config_file }}"
state: absent
- name: Save the current system and network config for reference in subsequent replays
- name: Save the current system and network config for reference in subsequent replays
lineinfile:
# This file should be cleared upon host reboot
path: "{{ last_bootstrap_config_file }}"
@ -142,9 +149,7 @@
- "prev_dns_servers: {{ dns_servers | join(',') }}"
- "prev_docker_http_proxy: {{ docker_http_proxy }}"
- "prev_docker_https_proxy: {{ docker_https_proxy }}"
- "prev_docker_no_proxy: {{ docker_no_proxy | join(',') }}"
- "prev_admin_username: {{ username | hash('sha1') }}"
- "prev_admin_password: {{ password | hash('sha1') }}"
- "prev_docker_no_proxy: {{ docker_no_proxy | sort | join(',') }}"
# Store the addresses as values determined in prepare-env stage not as merged values in
# validate-config stage as the latter requires subnet validation.
- "prev_pxeboot_start_address: {{ pxeboot_start_address }}"
@ -164,16 +169,19 @@
- "prev_external_oam_node_0_address: {{ external_oam_node_0_address }}"
- "prev_external_oam_node_1_address: {{ external_oam_node_1_address }}"
# Need to save the dictionary this way for proper comparison during replay
- name: Save previous docker registries header
# Need to save the dictionary this way for proper comparison during replay
- name: Save previous docker registries header
lineinfile:
path: "{{ last_bootstrap_config_file }}"
line: "prev_docker_registries:"
- name: Save previous docker registries content
- name: Save previous docker registries content
lineinfile:
path: "{{ last_bootstrap_config_file }}"
line: " {{ item.key }}: {{ item.value }}"
with_dict: "{{ docker_registries }}"
when: save_config
- name: Mark the bootstrap as completed
file:
path: "{{ bootstrap_completed_flag }}"
state: touch

View File

@ -38,9 +38,15 @@ RECONFIGURE_SYSTEM = False
RECONFIGURE_NETWORK = False
RECONFIGURE_SERVICE = False
INITIAL_POPULATION = True
INCOMPLETE_BOOTSTRAP = False
CONF = ConfigParser()
def touch(fname):
    """Create ``fname`` if it does not exist and refresh its timestamps.

    The file is opened in append mode, so existing content is left
    untouched; ``os.utime(fname, None)`` then sets the access and
    modification times to the current time.

    :param fname: path of the file to touch
    """
    with open(fname, 'a'):
        os.utime(fname, None)
def wait_system_config(client):
for _ in range(constants.SYSTEM_CONFIG_TIMEOUT):
try:
@ -59,6 +65,7 @@ def wait_system_config(client):
def populate_system_config(client):
if not INITIAL_POPULATION and not RECONFIGURE_SYSTEM:
return
# Wait for pre-populated system
system = wait_system_config(client)
@ -87,17 +94,78 @@ def populate_system_config(client):
)
patch = sysinv.dict_to_patch(values)
try:
client.sysinv.isystem.update(system.uuid, patch)
except Exception as e:
if INCOMPLETE_BOOTSTRAP:
# The previous bootstrap might have been interrupted while
# it was in the middle of persisting the initial system
# config.
isystem = client.sysinv.isystem.list()[0]
print("System type is %s" % isystem.system_type)
if isystem.system_type != "None":
# System update in previous play went through
pass
else:
raise e
else:
raise e
print("System config completed.")
def populate_load_config(client):
    """Create the software load record in sysinv on initial population.

    Does nothing unless INITIAL_POPULATION is set. The software version is
    read from the SW_VERSION option of the BOOTSTRAP_CONFIG section.

    If the create call fails while INCOMPLETE_BOOTSTRAP is set and sysinv
    already lists at least one load, the failure is treated as "load config
    in the previous play went through" and is ignored. Any other failure
    is re-raised unchanged.

    :param client: client object exposing ``sysinv.load.create`` and
                   ``sysinv.load.list``
    """
    if not INITIAL_POPULATION:
        return

    print("Populating load config...")
    patch = {'software_version': CONF.get('BOOTSTRAP_CONFIG', 'SW_VERSION'),
             'compatible_version': "N/A",
             'required_patches': "N/A"}
    try:
        client.sysinv.load.create(**patch)
    except Exception:
        # Only swallow the error when replaying an interrupted bootstrap
        # and a load record is already present; a bare raise keeps the
        # original traceback intact.
        if not (INCOMPLETE_BOOTSTRAP and client.sysinv.load.list()):
            raise
    print("Load config completed.")
def create_addrpool(client, addrpool_data, network_name):
    """Create an address pool, tolerating a leftover from an interrupted run.

    :param client: client object exposing ``sysinv.address_pool.create``
                   and ``sysinv.address_pool.list``
    :param addrpool_data: keyword arguments passed to the create call
    :param network_name: text looked for inside existing pool names when
                         the create call fails (substring match)
    :returns: the newly created pool, or the first existing pool whose
              name contains ``network_name`` when INCOMPLETE_BOOTSTRAP is set
    :raises Exception: the original create failure when no matching pool
                       is found or INCOMPLETE_BOOTSTRAP is not set
    """
    try:
        return client.sysinv.address_pool.create(**addrpool_data)
    except Exception:
        if INCOMPLETE_BOOTSTRAP:
            # The previous bootstrap might have been interrupted while
            # it was in the middle of persisting this network config data
            # and the controller host has not been created.
            for existing_pool in client.sysinv.address_pool.list() or []:
                if network_name in existing_pool.name:
                    return existing_pool
        # Bare raise re-raises the create failure with its traceback intact.
        raise
def create_network(client, network_data, network_name):
    """Create a network, tolerating a leftover from an interrupted run.

    :param client: client object exposing ``sysinv.network.create`` and
                   ``sysinv.network.list``
    :param network_data: keyword arguments passed to the create call
    :param network_name: exact name compared against existing networks
                         when the create call fails
    :raises Exception: the original create failure when no network with
                       that name exists or INCOMPLETE_BOOTSTRAP is not set
    """
    try:
        client.sysinv.network.create(**network_data)
    except Exception:
        if INCOMPLETE_BOOTSTRAP:
            # The previous bootstrap might have been interrupted while
            # it was in the middle of persisting this network config data
            # and the controller host has not been created.
            for network in client.sysinv.network.list():
                if network.name == network_name:
                    return
        # Bare raise re-raises the create failure with its traceback intact.
        raise
def delete_network_and_addrpool(client, network_name):
@ -113,7 +181,6 @@ def delete_network_and_addrpool(client, network_name):
host = client.sysinv.ihost.get('controller-0')
host_addresses = client.sysinv.address.list_by_host(host.uuid)
for addr in host_addresses:
print("Deleting address %s" % addr.uuid)
client.sysinv.address.delete(addr.uuid)
client.sysinv.network.delete(network_uuid)
client.sysinv.address_pool.delete(addrpool_uuid)
@ -128,9 +195,10 @@ def populate_mgmt_network(client):
'MANAGEMENT_END_ADDRESS')
dynamic_allocation = CONF.getboolean(
'BOOTSTRAP_CONFIG', 'DYNAMIC_ADDRESS_ALLOCATION')
network_name = 'mgmt'
if RECONFIGURE_NETWORK:
delete_network_and_addrpool(client, 'mgmt')
delete_network_and_addrpool(client, network_name)
print("Updating management network...")
else:
print("Populating management network...")
@ -142,7 +210,7 @@ def populate_mgmt_network(client):
'prefix': management_subnet.prefixlen,
'ranges': [(start_address, end_address)],
}
pool = client.sysinv.address_pool.create(**values)
pool = create_addrpool(client, values, 'management')
# create the network for the pool
values = {
@ -151,8 +219,7 @@ def populate_mgmt_network(client):
'dynamic': dynamic_allocation,
'pool_uuid': pool.uuid,
}
client.sysinv.network.create(**values)
create_network(client, values, network_name)
def populate_pxeboot_network(client):
@ -161,9 +228,10 @@ def populate_pxeboot_network(client):
'PXEBOOT_START_ADDRESS')
end_address = CONF.get('BOOTSTRAP_CONFIG',
'PXEBOOT_END_ADDRESS')
network_name = 'pxeboot'
if RECONFIGURE_NETWORK:
delete_network_and_addrpool(client, 'pxeboot')
delete_network_and_addrpool(client, network_name)
print("Updating pxeboot network...")
else:
print("Populating pxeboot network...")
@ -175,7 +243,7 @@ def populate_pxeboot_network(client):
'prefix': pxeboot_subnet.prefixlen,
'ranges': [(start_address, end_address)],
}
pool = client.sysinv.address_pool.create(**values)
pool = create_addrpool(client, values, network_name)
# create the network for the pool
values = {
@ -184,11 +252,7 @@ def populate_pxeboot_network(client):
'dynamic': True,
'pool_uuid': pool.uuid,
}
client.sysinv.network.create(**values)
def populate_infra_network(client):
return
create_network(client, values, network_name)
def populate_oam_network(client):
@ -198,9 +262,10 @@ def populate_oam_network(client):
'EXTERNAL_OAM_START_ADDRESS')
end_address = CONF.get('BOOTSTRAP_CONFIG',
'EXTERNAL_OAM_END_ADDRESS')
network_name = 'oam'
if RECONFIGURE_NETWORK:
delete_network_and_addrpool(client, 'oam')
delete_network_and_addrpool(client, network_name)
print("Updating oam network...")
else:
print("Populating oam network...")
@ -227,7 +292,7 @@ def populate_oam_network(client):
'gateway_address': CONF.get(
'BOOTSTRAP_CONFIG', 'EXTERNAL_OAM_GATEWAY_ADDRESS'),
})
pool = client.sysinv.address_pool.create(**values)
pool = create_addrpool(client, values, network_name)
# create the network for the pool
values = {
@ -236,8 +301,7 @@ def populate_oam_network(client):
'dynamic': False,
'pool_uuid': pool.uuid,
}
client.sysinv.network.create(**values)
create_network(client, values, network_name)
def populate_multicast_network(client):
@ -247,9 +311,10 @@ def populate_multicast_network(client):
'MANAGEMENT_MULTICAST_START_ADDRESS')
end_address = CONF.get('BOOTSTRAP_CONFIG',
'MANAGEMENT_MULTICAST_END_ADDRESS')
network_name = 'multicast'
if RECONFIGURE_NETWORK:
delete_network_and_addrpool(client, 'multicast')
delete_network_and_addrpool(client, network_name)
print("Updating multicast network...")
else:
print("Populating multicast network...")
@ -261,7 +326,7 @@ def populate_multicast_network(client):
'prefix': management_multicast_subnet.prefixlen,
'ranges': [(start_address, end_address)],
}
pool = client.sysinv.address_pool.create(**values)
pool = create_addrpool(client, values, network_name)
# create the network for the pool
values = {
@ -270,7 +335,7 @@ def populate_multicast_network(client):
'dynamic': False,
'pool_uuid': pool.uuid,
}
client.sysinv.network.create(**values)
create_network(client, values, network_name)
def populate_cluster_host_network(client):
@ -280,9 +345,10 @@ def populate_cluster_host_network(client):
'CLUSTER_HOST_START_ADDRESS')
end_address = CONF.get('BOOTSTRAP_CONFIG',
'CLUSTER_HOST_END_ADDRESS')
network_name = 'cluster-host'
if RECONFIGURE_NETWORK:
delete_network_and_addrpool(client, 'cluster-host')
delete_network_and_addrpool(client, network_name)
print("Updating cluster host network...")
else:
print("Populating cluster host network...")
@ -294,7 +360,7 @@ def populate_cluster_host_network(client):
'prefix': cluster_host_subnet.prefixlen,
'ranges': [(start_address, end_address)],
}
pool = client.sysinv.address_pool.create(**values)
pool = create_addrpool(client, values, network_name)
# create the network for the pool
values = {
@ -303,7 +369,7 @@ def populate_cluster_host_network(client):
'dynamic': True,
'pool_uuid': pool.uuid,
}
client.sysinv.network.create(**values)
create_network(client, values, network_name)
def populate_cluster_pod_network(client):
@ -313,9 +379,10 @@ def populate_cluster_pod_network(client):
'CLUSTER_POD_START_ADDRESS')
end_address = CONF.get('BOOTSTRAP_CONFIG',
'CLUSTER_POD_END_ADDRESS')
network_name = 'cluster-pod'
if RECONFIGURE_NETWORK:
delete_network_and_addrpool(client, 'cluster-pod')
delete_network_and_addrpool(client, network_name)
print("Updating cluster pod network...")
else:
print("Populating cluster pod network...")
@ -327,7 +394,7 @@ def populate_cluster_pod_network(client):
'prefix': cluster_pod_subnet.prefixlen,
'ranges': [(start_address, end_address)],
}
pool = client.sysinv.address_pool.create(**values)
pool = create_addrpool(client, values, network_name)
# create the network for the pool
values = {
@ -336,7 +403,7 @@ def populate_cluster_pod_network(client):
'dynamic': False,
'pool_uuid': pool.uuid,
}
client.sysinv.network.create(**values)
create_network(client, values, network_name)
def populate_cluster_service_network(client):
@ -346,9 +413,10 @@ def populate_cluster_service_network(client):
'CLUSTER_SERVICE_START_ADDRESS')
end_address = CONF.get('BOOTSTRAP_CONFIG',
'CLUSTER_SERVICE_END_ADDRESS')
network_name = 'cluster-service'
if RECONFIGURE_NETWORK:
delete_network_and_addrpool(client, 'cluster-service')
delete_network_and_addrpool(client, network_name)
print("Updating cluster service network...")
else:
print("Populating cluster service network...")
@ -360,7 +428,7 @@ def populate_cluster_service_network(client):
'prefix': cluster_service_subnet.prefixlen,
'ranges': [(start_address, end_address)],
}
pool = client.sysinv.address_pool.create(**values)
pool = create_addrpool(client, values, network_name)
# create the network for the pool
values = {
@ -369,7 +437,7 @@ def populate_cluster_service_network(client):
'dynamic': False,
'pool_uuid': pool.uuid,
}
client.sysinv.network.create(**values)
create_network(client, values, network_name)
def populate_network_config(client):
@ -377,7 +445,6 @@ def populate_network_config(client):
return
populate_mgmt_network(client)
populate_pxeboot_network(client)
populate_infra_network(client)
populate_oam_network(client)
populate_multicast_network(client)
populate_cluster_host_network(client)
@ -390,11 +457,7 @@ def populate_dns_config(client):
if not INITIAL_POPULATION and not RECONFIGURE_SYSTEM:
return
if INITIAL_POPULATION:
print("Populating DNS config...")
else:
print("Updating DNS config...")
print("Populating/Updating DNS config...")
nameservers = CONF.get('BOOTSTRAP_CONFIG', 'NAMESERVERS')
dns_list = client.sysinv.idns.list()
@ -405,21 +468,26 @@ def populate_dns_config(client):
}
patch = sysinv.dict_to_patch(values)
client.sysinv.idns.update(dns_record.uuid, patch)
print("DNS config completed.")
def populate_docker_config(client):
if not INITIAL_POPULATION and not RECONFIGURE_SERVICE:
return
if INITIAL_POPULATION:
print("Populating docker config...")
else:
print("Updating docker config...")
http_proxy = CONF.get('BOOTSTRAP_CONFIG', 'DOCKER_HTTP_PROXY')
https_proxy = CONF.get('BOOTSTRAP_CONFIG', 'DOCKER_HTTPS_PROXY')
no_proxy = CONF.get('BOOTSTRAP_CONFIG', 'DOCKER_NO_PROXY')
# Get rid of the faulty docker proxy entries that might have
# been created in the previous failed run.
parameters = client.sysinv.service_parameter.list()
for parameter in parameters:
if (parameter.name == 'http_proxy' or
parameter.name == 'https_proxy' or
parameter.name == 'no_proxy'):
client.sysinv.service_parameter.delete(parameter.uuid)
if http_proxy != 'undef' or https_proxy != 'undef':
parameters = {}
if http_proxy != 'undef':
@ -435,27 +503,30 @@ def populate_docker_config(client):
'resource': None,
'parameters': parameters
}
if RECONFIGURE_SERVICE:
parameters = client.sysinv.service_parameter.list()
for parameter in parameters:
if (parameter.name == 'http_proxy' or
parameter.name == 'https_proxy' or
parameter.name == 'no_proxy'):
client.sysinv.service_parameter.delete(parameter.uuid)
print("Populating/Updating docker proxy config...")
client.sysinv.service_parameter.create(**values)
print("Docker proxy config completed.")
use_default_registries = CONF.getboolean(
'BOOTSTRAP_CONFIG', 'USE_DEFAULT_REGISTRIES')
# Get rid of any faulty docker registry entries that might have been
# created in the previous failed run.
parameters = client.sysinv.service_parameter.list()
for parameter in parameters:
if (parameter.name == 'k8s' or
parameter.name == 'gcr' or
parameter.name == 'quay' or
parameter.name == 'docker' or
parameter.name == 'insecure_registry'):
client.sysinv.service_parameter.delete(parameter.uuid)
if not use_default_registries:
secure_registry = CONF.getboolean('BOOTSTRAP_CONFIG',
'IS_SECURE_REGISTRY')
parameters = {}
# TODO(tngo): The following 4 service parameters will be removed when
# we switch to the long term solution using a single "registries"
# service parameter that is extensible.
parameters['k8s'] = CONF.get('BOOTSTRAP_CONFIG', 'K8S_REGISTRY')
parameters['gcr'] = CONF.get('BOOTSTRAP_CONFIG', 'GCR_REGISTRY')
parameters['quay'] = CONF.get('BOOTSTRAP_CONFIG', 'QUAY_REGISTRY')
@ -471,16 +542,8 @@ def populate_docker_config(client):
'resource': None,
'parameters': parameters
}
if RECONFIGURE_SERVICE:
parameters = client.sysinv.service_parameter.list()
for parameter in parameters:
if (parameter.name == 'k8s' or
parameter.name == 'gcr' or
parameter.name == 'quay' or
parameter.name == 'docker' or
parameter.name == 'insecure_registry'):
client.sysinv.service_parameter.delete(
parameter.uuid)
print("Populating/Updating docker registry config...")
client.sysinv.service_parameter.create(**values)
print("Docker registry config completed.")
@ -668,7 +731,20 @@ def populate_controller_config(client):
'install_output': install_output,
}
print("Host values = %s" % values)
try:
controller = client.sysinv.ihost.create(**values)
except Exception as e:
if INCOMPLETE_BOOTSTRAP:
# The previous bootstrap might have been interrupted while
# it was in the middle of creating the controller-0 host.
controller = client.sysinv.ihost.get('controller-0')
if controller:
pass
else:
raise e
else:
raise e
print("Host controller-0 created.")
return controller
@ -729,7 +805,19 @@ def populate_default_storage_backend(client, controller):
print("Populating ceph storage backend config...")
values = {'confirmed': True}
try:
client.sysinv.storage_ceph.create(**values)
except Exception as e:
if INCOMPLETE_BOOTSTRAP:
storage_backends = client.sysinv.storage_backend.list()
for storage_backend in storage_backends:
if storage_backend.name == "ceph-store":
break
else:
raise e
else:
raise e
print("Default storage backend provisioning completed.")
def handle_invalid_input():
@ -764,6 +852,8 @@ if __name__ == '__main__':
raise Exception("Config file is not found!")
CONF.read(config_file)
INCOMPLETE_BOOTSTRAP = CONF.getboolean('BOOTSTRAP_CONFIG',
'INCOMPLETE_BOOTSTRAP')
# Puppet manifest might be applied as part of initial host
# config, set INITIAL_CONFIG_PRIMARY variable just in case.

View File

@ -11,6 +11,12 @@
# Keyring config
- block:
- name: Check if keyring data has been persisted
stat:
path: "{{ keyring_workdir }}"
register: tmp_keyring
- block:
- name: Delete the previous python_keyring directory if exists
file:
path: "{{ keyring_permdir + '/' + keyring_workdir | basename }}"
@ -18,6 +24,7 @@
- name: Persist keyring data
command: "mv {{ keyring_workdir }} {{ keyring_permdir }}"
when: tmp_keyring.stat.exists
when: save_password
- name: Ensure replicated config parent directory exists
@ -35,93 +42,42 @@
file_type: any
register: config_find
- name: Remove existing config files from permanent location
- block:
- name: Remove existing config files from permanent location
file:
path: "{{ config_permdir }}/{{ item.path | basename}}"
state: absent
with_items: "{{ config_find.files }}"
- name: Move new config files to permanent location
- name: Move new config files to permanent location
# Can't use command module due to wildcard
shell: mv {{ config_workdir }}/* {{ config_permdir }}
- name: Delete working config directory
- name: Delete working config directory
file:
path: "{{ config_workdir }}"
state: absent
when: config_find.matched != 0
# Postgres, PXE, Branding, Grub config tasks and filesystem resizing are
# moved to a separate file as they don't need to be executed again on replay.
# moved to a separate file as they don't need to be executed again once the
# controller-0 host has been created.
- include: one_time_config_tasks.yml
when: not reconfigured
when: not initial_db_populated
- block:
- name: Set input parameters to populate config script
set_fact:
script_input: "{{ config_permdir + '/' + bootstrap_config_file|basename }}"
- name: Update input parameters with reconfigure system flag
set_fact:
script_input: "{{ script_input + ' --system' }}"
when: system_config_update
- name: Update input parameters with reconfigure network flag
set_fact:
script_input: "{{ script_input + ' --network' }}"
when: network_config_update
- name: Update input parameters with reconfigure service flag
set_fact:
script_input: "{{ script_input + ' --service' }}"
when: docker_config_update
- name: Update input parameters if config from previous play is missing
set_fact:
script_input: "{{ script_input + ' --system --network --service' }}"
when: reconfigured and not last_config_file.stat.exists
- debug: var=script_input
- name: Remove the endpoint reconfig flag before reconfiguring the service endpoints
file:
path: /etc/platform/.service_endpoint_reconfigured
state: absent
when: reconfigure_endpoints
- name: Shuting down services for reconfiguration
# Shut down services if there are services impacting config changes in
# this replay or previous bootstrap did not complete for whatever reason.
- name: Shuting down services for reconfiguration as required
include: shutdown_services.yml
when: restart_services
- name: Saving config in sysinv database
script: populate_initial_config.py {{ script_input }}
register: populate_result
failed_when: false
- include: update_sysinv_database.yml
when: save_config_to_db
- debug: var=populate_result
- name: Fail if populate config script throws an exception
fail:
msg: "Failed to provision initial system configuration."
when: populate_result.rc != 0
- block:
- name: Add management floating address if this is the initial play
command: ip addr add {{ mgmt_floating_virtual }} dev lo scope host
when: not replayed
# If this is initial play or replay with management and/or oam network config change, must
# wait for the keystone endpoint runtime manifest to complete and restart
# sysinv agent and api.
- name: Wait for service endpoints reconfiguration to complete
wait_for:
path: /etc/platform/.service_endpoint_reconfigured
state: present
timeout: 360
msg: Timeout waiting for service endpoints reconfiguration to complete
when: not replayed or reconfigure_endpoints
- block:
# Update docker config file and restart docker if docker proxy is
# configured
- block:
- name: Ensure docker config directory exists
file:
path: /etc/systemd/system/docker.service.d
@ -170,8 +126,8 @@
when: use_docker_proxy
when: save_config
# Install certificate if SSL CA certificate is configured
- block:
- name: Copy ssl_ca certificate
copy:

View File

@ -50,11 +50,7 @@
owner: postgres
group: postgres
remote_src: yes
# with_fileglob can only be used for local lookup
# with_fileglob:
# - /etc/postgresql/*
with_items:
- "{{ postgres_result.files }}"
with_items: "{{ postgres_result.files }}"
- name: Create a symlink to PXE config files
file:
@ -105,6 +101,8 @@
- name: Resize filesystems (default)
command: "{{ item }}"
register: resize_result
failed_when: false
with_items:
- lvextend -L20G /dev/cgts-vg/pgsql-lv
- lvextend -L10G /dev/cgts-vg/cgcs-lv
@ -115,8 +113,19 @@
- resize2fs /dev/drbd3
- resize2fs /dev/drbd8
- name: Further resize if root disk size is larger than 240G
- name: Fail if file system resizing failed for a reason other than it has been done already
fail:
msg: "{{ item.item }} failed for the following reason: {{ item.stderr }}."
when: item.rc !=0 and item.stderr is not search('matches existing size') and
item.stderr is not search('not larger than existing size')
with_items: "{{ resize_result.results }}"
- block:
- name: Further resize if root disk size is larger than 240G
command: "{{ item }}"
register: resize_result
failed_when: false
with_items:
- lvextend -L40G /dev/cgts-vg/pgsql-lv
- lvextend -L20G /dev/cgts-vg/cgcs-lv
@ -124,4 +133,11 @@
- drbdadm -- --assume-peer-has-space resize all
- resize2fs /dev/drbd0
- resize2fs /dev/drbd3
- name: Fail if additional resizing failed for a reason other than it has been done already
fail:
msg: "{{ item.item }} failed for the following reason: {{ item.stderr }}."
when: item.rc !=0 and item.stderr is not search('matches existing size') and
item.stderr is not search('Nothing to do!')
with_items: "{{ resize_result.results }}"
when: root_disk_size|int > minimum_root_disk_size

View File

@ -33,13 +33,20 @@
args:
warn: false
- block:
- name: Restart etcd
systemd:
name: etcd
state: restarted
# Revert configuration to loopback interface
- block:
rescue:
- name: Etcd failed to restart, try one more time
systemd:
name: etcd
state: restarted
- block: # Revert configuration to loopback interface
- name: Set facts derived from previous network configurations
set_fact:
prev_management_subnet_prefix: "{{ prev_management_subnet | ipaddr('prefix') }}"
@ -75,6 +82,8 @@
# are reconfigured.
- name: Remove loopback interface in sysinv db and associated addresses
shell: "{{ item }}"
register: remove_addresses
failed_when: false
with_items:
- source /etc/platform/openrc; system host-if-delete controller-0 lo
- "ip addr delete {{ prev_mgmt_nfs_2_virtual }} dev lo scope host"
@ -84,11 +93,21 @@
- "ip addr delete {{ prev_mgmt_virtual }} brd {{ management_broadcast }} dev lo:1 scope host"
- "ip addr delete {{ prev_cluster_virtual }} brd {{ cluster_broadcast }} dev lo:5 scope host"
- name: Fail if removing interface addresses failed for reason other than it has been done already
fail:
msg: "{{ item.item }} failed for reason: {{ item.stderr }}."
when: item.rc != 0 and not incomplete_bootstrap
with_items: "{{ remove_addresses.results }}"
when: last_config_file_exists
- block:
# Enable the new management floating address so that sysinv-api is reachable at this IP when
# service endpoints have been reconfigured and sysinv-api restarted.
- name: Add the new management address for service endpoints reconfiguration
command: ip addr add {{ mgmt_floating_virtual }} dev lo scope host
register: add_mgmt_address
failed_when: add_mgmt_address.rc != 0 and not incomplete_bootstrap
when: mgmt_floating_virtual != prev_mgmt_floating_virtual
when: reconfigure_endpoints

View File

@ -0,0 +1,84 @@
---
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# SUB-TASKS DESCRIPTION:
#   Persist new configuration data in sysinv database.
#
#   The task list builds the argument string for populate_initial_config.py,
#   runs the script, fails the play if the script exits non-zero, waits for
#   the service endpoint reconfiguration flag file and finally touches the
#   initial-db-populated flag file on the first successful population.
#

# The script always receives the path of the persisted bootstrap config file.
- name: Set input parameters to populate config script
  set_fact:
    script_input: "{{ config_permdir + '/' + bootstrap_config_file|basename }}"

# Reconfiguration flags are appended only once the initial database
# population has already been done.
- block:
    - block:  # executed if config output file exists
        - name: Update input parameters with reconfigure system flag
          set_fact:
            script_input: "{{ script_input + ' --system' }}"
          when: system_config_update

        - name: Update input parameters with reconfigure network flag
          set_fact:
            script_input: "{{ script_input + ' --network' }}"
          when: network_config_update

        - name: Update input parameters with reconfigure service flag
          set_fact:
            script_input: "{{ script_input + ' --service' }}"
          # It is possible that the services failed to be brought up due to previous
          # docker configuration and that setting has not been written to the output
          # file.
          when: docker_config_update or incomplete_bootstrap

      when: last_config_file_exists

    # Without the previous config file there is nothing to compare against,
    # so every category is flagged for reconfiguration.
    - name: Update input parameters if config from previous play has not been generated or is missing
      set_fact:
        script_input: "{{ script_input + ' --system --network --service' }}"
        reconfigure_endpoints: true
      when: not last_config_file_exists

  when: initial_db_populated

- debug:
    var: script_input

- block:
    - name: Remove the endpoint reconfig flag before reconfiguring the service endpoints
      file:
        path: /etc/platform/.service_endpoint_reconfigured
        state: absent
      when: reconfigure_endpoints

    # Make sure the management floating address is available
    - name: Add the management floating address
      command: ip addr add {{ mgmt_floating_virtual }} dev lo scope host
      register: add_mgmt_address
      # A non-zero return code is tolerated when replaying an incomplete
      # bootstrap.
      failed_when: add_mgmt_address.rc != 0 and not incomplete_bootstrap

  when: (not replayed) or (not initial_db_populated) or (reconfigure_endpoints)

# failed_when is disabled here so that the script output can be printed by
# the debug task below before the play is failed explicitly.
- name: Saving config in sysinv database
  script: populate_initial_config.py {{ script_input }}
  register: populate_result
  failed_when: false

- debug:
    var: populate_result

- name: Fail if populate config script throws an exception
  fail:
    msg: "Failed to provision initial system configuration."
  when: populate_result.rc != 0

# If this is initial play or replay with management and/or oam network
# config change, must wait for the keystone endpoint runtime manifest
# to complete and restart sysinv agent and api.
- name: Wait for service endpoints reconfiguration to complete
  wait_for:
    path: /etc/platform/.service_endpoint_reconfigured
    state: present
    timeout: 360
    msg: Timeout waiting for service endpoints reconfiguration to complete

- name: Set flag to mark the initial db population completed milestone
  file:
    path: "{{ initial_db_populated_flag }}"
    state: touch
  when: not initial_db_populated

View File

@ -120,93 +120,6 @@
- admin_password
- override_files_dir
- name: Set initial address facts if not defined. They will be updated later
set_fact:
pxeboot_start_address: "{{ pxeboot_start_address | default('derived') }}"
pxeboot_end_address: "{{ pxeboot_end_address | default('derived') }}"
management_start_address: "{{ management_start_address | default('derived') }}"
management_end_address: "{{ management_end_address | default('derived') }}"
cluster_host_start_address: "{{ cluster_host_start_address | default('derived') }}"
cluster_host_end_address: "{{ cluster_host_end_address | default('derived') }}"
cluster_pod_start_address: "{{ cluster_pod_start_address | default('derived') }}"
cluster_pod_end_address: "{{ cluster_pod_end_address | default('derived') }}"
cluster_service_start_address: "{{ cluster_service_start_address | default('derived') }}"
cluster_service_end_address: "{{ cluster_service_end_address | default('derived') }}"
external_oam_start_address: "{{ external_oam_start_address | default('derived') }}"
external_oam_end_address: "{{ external_oam_end_address | default('derived') }}"
management_multicast_start_address: "{{ management_multicast_start_address | default('derived') }}"
management_multicast_end_address: "{{ management_multicast_end_address | default('derived') }}"
external_oam_node_0_address: "{{ external_oam_node_0_address | default('derived') }}"
external_oam_node_1_address: "{{ external_oam_node_1_address | default('derived') }}"
- name: Set default registries dictionary
set_fact:
default_docker_registries:
k8s.gcr.io: k8s.gcr.io
gcr.io: gcr.io
quay.io: quay.io
docker.io: docker.io
- name: Merge user and default registries dictionaries
set_fact:
docker_registries: "{{ default_docker_registries | combine(docker_registries) }}"
- name: Initialize some flags to be used in subsequent roles/tasks
set_fact:
reconfigured: false
system_config_update: false
network_config_update: false
docker_config_update: false
save_password: true
save_config: true
use_docker_proxy: false
use_unified_registry: false
restart_services: false
reconfigure_endpoints: false
- name: Set initial facts
set_fact:
system_params:
'system_mode': "{{ system_mode }}"
'timezone': "{{ timezone }}"
root_disk_size: "{{ standard_root_disk_size }}"
root_disk_idx: 0
localhost_name_ip_mapping: "127.0.0.1\tlocalhost\tlocalhost.localdomain localhost4 localhost4.localdomain4"
network_params:
'pxeboot_subnet': "{{ pxeboot_subnet }}"
'management_subnet': "{{ management_subnet }}"
'cluster_host_subnet': "{{ cluster_host_subnet }}"
'cluster_pod_subnet': "{{ cluster_pod_subnet }}"
'cluster_service_subnet': "{{ cluster_service_subnet }}"
'external_oam_subnet': "{{ external_oam_subnet }}"
'external_oam_gateway_address': "{{ external_oam_gateway_address }}"
'external_oam_floating_address': "{{ external_oam_floating_address }}"
'management_multicast_subnet': "{{ management_multicast_subnet }}"
# Set this placeholder here to workaround an Ansible quirk
derived_network_params:
place_holder: place_holder
ansible_remote_tmp: "{{ ansible_remote_tmp | default('/tmp/.ansible-${USER}/tmp') }}"
pods_wait_time: "{{ pods_wait_time | default(120) }}"
- name: Turn on use_docker_proxy flag
set_fact:
use_docker_proxy: true
when: (docker_http_proxy is defined and docker_http_proxy is not none) or
(docker_https_proxy is defined and docker_https_proxy is not none)
- name: Set default values for platform registries
set_fact:
default_k8s_registry: k8s.gcr.io
default_gcr_registry: gcr.io
default_quay_registry: quay.io
default_docker_registry: docker.io
- name: Set default values for docker proxies if not defined
set_fact:
docker_http_proxy: "{{ docker_http_proxy | default('undef') }}"
docker_https_proxy: "{{ docker_https_proxy | default('undef') }}"
docker_no_proxy: "{{ docker_no_proxy | default([]) }}"
- name: Retrieve software version number
# lookup module does not work with /etc/build.info as it does not have ini
# format. Resort to shell source.
@ -243,6 +156,99 @@
config_permdir: "{{ platform_path + '/config/' + software_version }}"
puppet_permdir: "{{ platform_path + '/puppet/' + software_version }}"
# Every address range boundary and OAM node address keeps its current value
# when the variable is already defined; otherwise it is set to the placeholder
# string 'derived'.
- name: Set initial address facts if not defined. They will be updated later
set_fact:
pxeboot_start_address: "{{ pxeboot_start_address | default('derived') }}"
pxeboot_end_address: "{{ pxeboot_end_address | default('derived') }}"
management_start_address: "{{ management_start_address | default('derived') }}"
management_end_address: "{{ management_end_address | default('derived') }}"
cluster_host_start_address: "{{ cluster_host_start_address | default('derived') }}"
cluster_host_end_address: "{{ cluster_host_end_address | default('derived') }}"
cluster_pod_start_address: "{{ cluster_pod_start_address | default('derived') }}"
cluster_pod_end_address: "{{ cluster_pod_end_address | default('derived') }}"
cluster_service_start_address: "{{ cluster_service_start_address | default('derived') }}"
cluster_service_end_address: "{{ cluster_service_end_address | default('derived') }}"
external_oam_start_address: "{{ external_oam_start_address | default('derived') }}"
external_oam_end_address: "{{ external_oam_end_address | default('derived') }}"
management_multicast_start_address: "{{ management_multicast_start_address | default('derived') }}"
management_multicast_end_address: "{{ management_multicast_end_address | default('derived') }}"
external_oam_node_0_address: "{{ external_oam_node_0_address | default('derived') }}"
external_oam_node_1_address: "{{ external_oam_node_1_address | default('derived') }}"
# Default registry map: each public registry name maps to itself.
- name: Set default registries dictionary
set_fact:
default_docker_registries:
k8s.gcr.io: k8s.gcr.io
gcr.io: gcr.io
quay.io: quay.io
docker.io: docker.io
# combine() gives precedence to the right-hand dictionary, so entries in
# docker_registries override the defaults for matching keys.
# NOTE(review): assumes docker_registries is already defined at this point
# (e.g. by role defaults or a host override) - confirm.
- name: Merge user and default registries dictionaries
set_fact:
docker_registries: "{{ default_docker_registries | combine(docker_registries) }}"
# Baseline values for the control flags. save_password and save_config_to_db
# start out true; every other flag starts out false.
- name: Initialize some flags to be used in subsequent roles/tasks
set_fact:
system_config_update: false
network_config_update: false
docker_config_update: false
save_password: true
save_config_to_db: true
use_docker_proxy: false
use_unified_registry: false
restart_services: false
reconfigure_endpoints: false
# Replay related flags
# These three are turned on by the replay checks later in this file.
last_config_file_exists: false
incomplete_bootstrap: false
initial_db_populated: false
# Collect the user-facing system and network settings into dictionaries and
# define miscellaneous defaults and the replay marker file paths.
- name: Set initial facts
set_fact:
system_params:
'system_mode': "{{ system_mode }}"
'timezone': "{{ timezone }}"
root_disk_size: "{{ standard_root_disk_size }}"
root_disk_idx: 0
localhost_name_ip_mapping: "127.0.0.1\tlocalhost\tlocalhost.localdomain localhost4 localhost4.localdomain4"
network_params:
'pxeboot_subnet': "{{ pxeboot_subnet }}"
'management_subnet': "{{ management_subnet }}"
'cluster_host_subnet': "{{ cluster_host_subnet }}"
'cluster_pod_subnet': "{{ cluster_pod_subnet }}"
'cluster_service_subnet': "{{ cluster_service_subnet }}"
'external_oam_subnet': "{{ external_oam_subnet }}"
'external_oam_gateway_address': "{{ external_oam_gateway_address }}"
'external_oam_floating_address': "{{ external_oam_floating_address }}"
'management_multicast_subnet': "{{ management_multicast_subnet }}"
# Set this placeholder here to workaround an Ansible quirk
derived_network_params:
place_holder: place_holder
# Keep user-supplied values when defined, otherwise fall back to the defaults
ansible_remote_tmp: "{{ ansible_remote_tmp | default('/tmp/.ansible-${USER}/tmp') }}"
pods_wait_time: "{{ pods_wait_time | default(120) }}"
# Marker files for the replay logic. Both paths are built from config_permdir,
# so config_permdir must already be set when this task runs.
bootstrap_completed_flag: "{{ config_permdir }}/.bootstrap_completed"
initial_db_populated_flag: "{{ config_permdir }}/.initial_db_population_completed"
# A proxy setting counts as provided only when the variable is both defined
# and not null; either the http or the https proxy is enough to enable the flag.
- name: Turn on use_docker_proxy flag
set_fact:
use_docker_proxy: true
when: (docker_http_proxy is defined and docker_http_proxy is not none) or
(docker_https_proxy is defined and docker_https_proxy is not none)
# Names of the default public registries, one fact per registry.
- name: Set default values for platform registries
set_fact:
default_k8s_registry: k8s.gcr.io
default_gcr_registry: gcr.io
default_quay_registry: quay.io
default_docker_registry: docker.io
# Undefined proxies fall back to the literal string 'undef' and an undefined
# no-proxy list to an empty list. This must stay after the use_docker_proxy
# check, which relies on `is defined` to detect user-provided proxies.
- name: Set default values for docker proxies if not defined
set_fact:
docker_http_proxy: "{{ docker_http_proxy | default('undef') }}"
docker_https_proxy: "{{ docker_https_proxy | default('undef') }}"
docker_no_proxy: "{{ docker_no_proxy | default([]) }}"
# Give the bootstrap config output file on the host a generic name so the
# same file is referenced if the host is bootstrapped locally and remotely
# in whatever order.
@ -265,29 +271,61 @@
replayed: true
when: openrc_file.stat.exists and docker.rc == 0
- block:
- name: Check if the controller-0 host has been successfully provisioned
shell: source /etc/platform/openrc; system host-list|grep controller-0
failed_when: false
register: host_check
- block: # executed if it is a replay
- name: Check the overall status of the previous play
stat:
path: "{{ bootstrap_completed_flag }}"
register: bootstrap_completed
- block: # system has been configured
- name: Set flag to indicate that this host has been previously configured
- block: # executed when previous play did not complete
- name: Turn on incomplete_bootstrap flag if the previous play did not complete
set_fact:
reconfigured: true
incomplete_bootstrap: true
restart_services: true
# Look for the marker file that records a completed initial database
# population.
- name: Check the initial database population status
stat:
path: "{{ initial_db_populated_flag }}"
register: initial_db_population_completed
# Only initial_db_populated is set here; restart_services is already turned on
# by the preceding "Turn on incomplete_bootstrap flag" task, so the task name
# mentions just the flag this task actually changes.
- name: Turn on initial_db_populated flag if initial db population did complete
set_fact:
initial_db_populated: true
when: initial_db_population_completed.stat.exists
when: not bootstrap_completed.stat.exists
# Runs only when incomplete_bootstrap was not turned on above.
- block: # executed when previous play completed
# Delete the completion marker left by the previous play.
# NOTE(review): presumably it is recreated when the current play finishes
# successfully - confirm where the flag is written.
- name: Remove bootstrap_completed flag for the current play if the previous play succeeded
file:
path: "{{ bootstrap_completed_flag }}"
state: absent
become: yes
# The flag is set directly here, without checking initial_db_populated_flag.
- name: Turn on initial_db_populated flag
set_fact:
initial_db_populated: true
when: not incomplete_bootstrap
# The previous play failed but the one before that succeeded. Execute the following
# block if initial db population completed.
- block:
- name: Find previous config file for this host
stat:
path: "{{ last_bootstrap_config_file }}"
register: last_config_file
- block:
- block: # executed if the last config file exists
- name: Turn on last_config_file_exists flag
set_fact:
last_config_file_exists: true
- name: Set last config file to import (local)
set_fact:
last_config: "{{ last_bootstrap_config_file }}"
when: inventory_hostname == 'localhost'
# Currently Ansible include_vars only works with local file
# Currently Ansible include_vars only works with local file.
- block:
# Give a host specific name in case the playbook is used to bootstrap
# multiple remote hosts simultaneously
@ -313,6 +351,11 @@
(prev_timezone != timezone) or
(prev_dns_servers.split(',') | sort != dns_servers | sort)
# Normalize prev_docker_no_proxy into a sorted list (or an empty list when the
# value is empty/falsy) so it can be compared with the sorted docker_no_proxy
# list in the docker reconfiguration check.
# NOTE(review): presumably the previous value is a comma-separated string
# read from the last config file - confirm.
- name: Convert previous docker no proxy config value for comparison
set_fact:
prev_docker_no_proxy:
"{{ (prev_docker_no_proxy.split(',') | sort) if prev_docker_no_proxy else [] }}"
- name: Turn on docker reconfiguration flag if docker config is changed
set_fact:
docker_config_update: true
@ -320,7 +363,7 @@
((use_docker_proxy) and
(prev_docker_http_proxy != docker_http_proxy or
prev_docker_https_proxy != docker_https_proxy or
prev_docker_no_proxy != docker_no_proxy))
prev_docker_no_proxy != docker_no_proxy | sort))
- name: Turn on service endpoints reconfiguration flag if management and/or oam network config is changed
set_fact:
@ -370,37 +413,22 @@
- name: Turn off save_password flag if admin password has not changed
set_fact:
save_password: false
username: "{{ prev_admin_username }}"
password: "{{ prev_admin_password }}"
# TODO(tngo): there seems to be a puppet/sysinv limitation that prevents password
# reconfiguration to work without an extra boot. Temporarily disable
# it for replay for now.
when: prev_admin_password == admin_password|hash('sha1')
or replayed
when: replayed
# Re-evaluate condition to persist config data to sysinv database
- name: Turn off save_config flag if system, network, and docker configurations have not changed
# Re-evaluate the condition to persist config data to sysinv database
- name: Turn off save_config_to_db flag
set_fact:
save_config: false
save_config_to_db: false
when: not system_config_update and
not network_config_update and
not docker_config_update
- block:
- debug:
msg: "Configurations are unchanged. There's nothing to play!"
- name: Stop playing if this is the only target host
meta: end_play
when: play_hosts|length == 1
- name: Turn on skip_play flag
set_fact:
skip_play: true
when: not save_password and not save_config
not docker_config_update and
not incomplete_bootstrap
when: last_config_file.stat.exists
when: host_check.rc == 0
when: initial_db_populated
when: replayed # bootstrap manifest has been applied
- name: Check volume groups
@ -483,7 +511,9 @@
restart_services flag: {{ restart_services }},
endpoints_reconfiguration_flag: {{ reconfigure_endpoints }},
save_password flag: {{ save_password }},
save_config flag: {{ save_config }},
skip_play flag: {{ skip_play }}
save_config_to_db flag: {{ save_config_to_db }},
skip_play flag: {{ skip_play }},
incomplete_bootstrap flag: {{ incomplete_bootstrap }},
initial_db_populated_flag: {{ initial_db_populated }}
when: not skip_play

View File

@ -519,10 +519,12 @@
- "USE_DEFAULT_REGISTRIES={{ use_default_registries }}"
- "IS_SECURE_REGISTRY={{ is_secure_registry | default(True) }}"
- "RECONFIGURE_ENDPOINTS={{ reconfigure_endpoints }}"
- "INITIAL_DB_POPULATED={{ initial_db_populated }}"
- "INCOMPLETE_BOOTSTRAP={{ incomplete_bootstrap }}"
- name: Write simplex flag
file:
path: /etc/platform/simplex
state: touch
when: save_config
when: save_config_to_db