Integrate ha-test-suite into the repo

This commit puts the ha-test-suite under the tools directory instead of
having it downloaded from the outside (originally it was available at
the link https://github.com/rscarazz/tripleo-director-ha-test-suite).
At code level now the suite is copied directly from the local path
tools/ha-test-suite and the executable now is ha-test-suite.sh.

Change-Id: I087bc28a0afa3ede9b2fb698892b8306f56790a2
This commit is contained in:
Raoul Scarazzini 2017-05-03 12:41:23 -04:00
parent 98593f6628
commit d9e9613a8b
22 changed files with 771 additions and 13 deletions

View File

@ -17,14 +17,13 @@
"{{ working_dir }}/workarounds.sh"
when: apply_workarounds
- name: Get overcloud-ha-test-suite on undercloud and controllers
- name: Copy ha-test-suite on undercloud and controllers
shell: >
rm -rf tripleo-director-ha-test-suite;
git clone https://github.com/rscarazz/tripleo-director-ha-test-suite/ tripleo-director-ha-test-suite;
delegate_to: "{{ item }}"
/usr/bin/rsync --delay-updates -F --compress --archive -e 'ssh -F {{ local_working_dir }}/ssh.config.ansible' {{ local_working_dir }}/tripleo-quickstart-utils/tools/ha-test-suite {{ item }}:
delegate_to: "localhost"
with_items:
- "undercloud"
- "{{ groups['controller'] }}"
- "undercloud"
- name: Include test sequence depending on release
include_vars:
@ -39,7 +38,7 @@
- name: HA test - Failed actions (overcloud)
delegate_to: overcloud-controller-0
shell: >
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_check-failed-actions
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_check-failed-actions
register: test_ha_failed_actions_cmd
always:
- name: copy stdout test result to undercloud and check command
@ -54,7 +53,7 @@
- name: HA test - Master/Slave core resource stop and start (overcloud)
delegate_to: overcloud-controller-0
shell: >
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_master-slave -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_master-slave
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_master-slave -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_master-slave
register: test_ha_master_slave_cmd
always:
- name: copy stdout test result to undercloud and check command
@ -69,7 +68,7 @@
- name: HA test Keystone stop (overcloud)
delegate_to: overcloud-controller-0
shell: >
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_keystone-constraint-removal
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_keystone-constraint-removal
register: test_ha_keystone_stop_cmd
always:
- name: copy stdout test result to undercloud and check command
@ -84,7 +83,7 @@
- name: HA test Keystone removal (overcloud)
delegate_to: overcloud-controller-0
shell: >
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_keystone-constraint-removal
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_keystone-constraint-removal
register: test_ha_keystone_constraint_removal_cmd
always:
- name: copy stdout test result to undercloud and check command
@ -99,7 +98,7 @@
- name: HA test NG A (overcloud)
delegate_to: overcloud-controller-0
shell: >
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_pacemaker-light-a -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_pacemaker-light
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_pacemaker-light-a -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_pacemaker-light
register: test_ha_ng_a_cmd
always:
- name: copy stdout test result to undercloud and check command
@ -114,7 +113,7 @@
- name: HA test NG B (overcloud)
delegate_to: overcloud-controller-0
shell: >
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_pacemaker-light-b -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_pacemaker-light
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_pacemaker-light-b -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_pacemaker-light
register: test_ha_ng_b_cmd
always:
- name: copy stdout test result to undercloud and check command
@ -129,7 +128,7 @@
- name: HA test NG C (overcloud)
delegate_to: overcloud-controller-0
shell: >
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_pacemaker-light-c -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_pacemaker-light
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_pacemaker-light-c -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_pacemaker-light
register: test_ha_ng_c_cmd
always:
- name: copy stdout test result to undercloud and check command
@ -143,7 +142,7 @@
- block:
- name: HA Test instance deploy on the overcloud (undercloud)
shell: >
{{ working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ working_dir }}/tripleo-director-ha-test-suite/test/test_instance-creation -r {{ working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_instance-creation -u
{{ working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ working_dir }}/ha-test-suite/test/test_instance-creation -r {{ working_dir }}/ha-test-suite/recovery/recovery_instance-creation -u
register: test_ha_instance_cmd
always:
- name: copy stdout test result to undercloud and check command

View File

@ -0,0 +1,145 @@
# OpenStack TripleO HA Test Suite
This project is a modular and customizable test suite to be applied in an
Overcloud OpenStack environment deployed via TripleO upstream or Red Hat
OpenStack Director (OSPd).
## Usage
The script needs at least a test file (-t) which must contain the sequence of
the operations to be done. A recovery file (-r), with the sequence of the
operations needed to recover the environment, can also be passed. So a typical
invocation will be something like this:
```console
[heat-admin@overcloud-controller-0 ha-test-suite]$ ./ha-test-suite.sh -t test/test_keystone-constraint-removal -r recovery/recovery_keystone-constraint-removal
Fri May 20 15:27:19 UTC 2016 - Populating overcloud elements...OK
Fri May 20 15:27:22 UTC 2016 - Test: Stop keystone resource (by stopping httpd), check no other resource is stopped
Fri May 20 15:27:22 UTC 2016 * Step 1: disable keystone resource via httpd stop
Fri May 20 15:27:22 UTC 2016 - Performing action disable on resource httpd ..OK
Fri May 20 15:27:26 UTC 2016 - List of cluster's failed actions:
Cluster is OK.
Fri May 20 15:27:29 UTC 2016 * Step 2: check resource status
Fri May 20 15:27:29 UTC 2016 - Cycling for 10 minutes polling every minute the status of the resources
Fri May 20 15:28:29 UTC 2016 - Polling...
delay -> OK
galera -> OK
...
...
openstack-sahara-engine -> OK
rabbitmq -> OK
redis -> OK
Fri May 20 15:41:00 UTC 2016 - List of cluster's failed actions:
Cluster is OK.
Fri May 20 15:41:03 UTC 2016 - Waiting 10 seconds to recover environment
Fri May 20 15:41:13 UTC 2016 - Recovery: Enable keystone via httpd and check for failed actions
Fri May 20 15:41:13 UTC 2016 * Step 1: enable keystone resource via httpd
Fri May 20 15:41:13 UTC 2016 - Performing action enable on resource httpd-clone OK
Fri May 20 15:41:15 UTC 2016 - List of cluster's failed actions:
Cluster is OK.
Fri May 20 15:41:17 UTC 2016 - End
```
The exit status will depend on the result of the operations. If a disable
operation fails, if failed actions appear, or if the recovery does not end
with success, the exit status will not be 0.
## Test and recoveries
Test and recovery are bash script portions that are
included inside the main script. Some functions and variables are available to
help on recurring operations. These functions are listed here:
- **check_failed_actions**: will print failed actions and return error in case
some of them are present;
- **check_resources_process_status**: will check for the process status of the
resources on the system (not in the cluster), i.e. will check if there is a
process for mysql daemon;
- **wait_resource_status**: will wait, up to a default timeout
($RESOURCE_CHANGE_STATUS_TIMEOUT), for a resource to reach a status;
- **check_resource_status**: will check a resource status, i.e. if you want to
check if httpd resource is started;
- **wait_cluster_start**: will wait, up to a timeout
($RESOURCE_CHANGE_STATUS_TIMEOUT), for the cluster to be started, specifically
it will wait for all resources to be in state "Started";
- **play_on_resources**: will set the status of a resource;
The variables are:
- **OVERCLOUD_CORE_RESOURCES**: which are galera and rabbitmq
- **OVERCLOUD_RESOURCES**: which are *all* the resources
- **OVERCLOUD_SYSTEMD_RESOURCES**: which are the resources managed via systemd
by pacemaker;
And can be used in combination to write test and recovery files.
### Test file contents
A typical test file, say test/test_keystone-constraint-removal, will contain
something like this:
```bash
# Test: Stop keystone resource (by stopping httpd), check no other resource is stopped
echo "$(date) * Step 1: disable keystone resource via httpd stop"
play_on_resources "disable" "httpd"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
echo "$(date) * Step 2: check resource status"
# Define resource list without httpd
OVERCLOUD_RESOURCES_NO_KEYSTONE="$(echo $OVERCLOUD_RESOURCES | sed 's/httpd/ /g')"
# Define number of minutes to look for status
MINUTES=10
# Cycling for $MINUTES minutes polling every minute the status of the resources
echo "$(date) - Cycling for 10 minutes polling every minute the status of the resources"
i=0
while [ $i -lt $MINUTES ]
do
# Wait a minute
sleep 60
echo "$(date) - Polling..."
for resource in $OVERCLOUD_RESOURCES_NO_KEYSTONE
do
echo -n "$resource -> "
check_resource_status "$resource" "Started"
[ $? -eq 0 ] && echo "OK" || (FAILURES=1; echo "Error!")
done
let "i++"
done
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
```
Code is commented and should be self explaining, but in short:
- the first commented line, after "# Test: " is read as test title;
- using play_on_resources it disables httpd resource;
- it checks for failed actions;
- it defines a variable named OVERCLOUD_RESOURCES_NO_KEYSTONE containing
all the resources but httpd;
- it cycles for 10 minutes, polling every minute the status of all the
resources;
If any of these steps for some reason fails, then the overall test will be
considered failed and the exit status will not be 0.
### Recovery file contents
A typical recovery file, say recovery/recovery_keystone-constraint-removal,
will contain something like this:
```bash
# Recovery: Enable keystone via httpd and check for failed actions
echo "$(date) * Step 1: enable keystone resource via httpd"
play_on_resources "enable" "httpd-clone"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
```
Again:
- the first commented line, after "# Recovery: " is read as recovery title;
- using play_on_resources it enables httpd resource;
- it checks for failed actions;

View File

@ -0,0 +1,87 @@
#!/bin/bash
# Raoul Scarazzini (rasca@redhat.com)
# This script provides a testing suite for TripleO/Director OpenStack HA (so with Pacemaker) environments
#
# Usage: ha-test-suite.sh -t <testfile> [-r <recoverfile>] [-u]
# The test and recovery files are *sourced*, so they can use the helper
# functions from include/functions, and any "exit" they perform terminates
# this script.

# Define main workdir
WORKDIR=$(dirname $0)

# Source function library.
. $WORKDIR/include/functions

# Fixed parameters
# How much time wait in seconds for a resource to change status (i.e. from started to stopped)
RESOURCE_CHANGE_STATUS_TIMEOUT=600
# How much time wait in seconds before starting recovery
DEFAULT_RECOVERY_WAIT_TIME=10

# Command line parameters
if [ $# -gt 0 ]
then
  while :; do
    case $1 in
      -h|-\?|--help)
        usage
        exit
        ;;
      -u|--undercloud)
        # Run on the undercloud: skip the overcloud resource discovery below.
        undercloud=true
        shift
        ;;
      -t|--test)
        # The option value is consumed here; the option word itself by the
        # shift at the end of the loop.
        test_sequence="$2"
        shift
        ;;
      -r|--recover)
        recovery_sequence="$2"
        shift
        ;;
      --)
        shift
        break
        ;;
      -?*)
        # Unknown option: show usage and fail.
        usage
        exit 1
        ;;
      *)
        break
    esac
    shift
  done
else
  usage
  exit 1
fi

# Populating overcloud elements if not on undercloud
if [ "$undercloud" != true ]
then
  echo -n "$(date) - Populating overcloud elements..."
  OVERCLOUD_CORE_RESOURCES="galera redis rabbitmq"
  OVERCLOUD_RESOURCES=$(sudo pcs resource show | egrep '^ (C|[a-Z])' | sed 's/.* \[\(.*\)\]/\1/g' | sed 's/ \(.*\)(.*):.*/\1/g' | sort)
  OVERCLOUD_SYSTEMD_RESOURCES=$(sudo pcs config show | egrep "Resource:.*systemd"|grep -v "haproxy"|awk '{print $2}')
  echo "OK"
fi

# Run the test sequence, if readable; its "# ... Test:" line is the title.
if [ -f "$test_sequence" ]
then
  echo "$(date) - Test: $(grep '^#.*Test:' $test_sequence | sed 's/^#.*Test: //')"
  . $test_sequence
else
  echo "No test file passed or unable to read test file."
fi

# Run the recovery sequence, if readable, after a grace period.
if [ -f "$recovery_sequence" ]
then
  echo "$(date) - Waiting $DEFAULT_RECOVERY_WAIT_TIME seconds to recover environment"
  sleep $DEFAULT_RECOVERY_WAIT_TIME
  echo "$(date) - Recovery: $(grep '^#.*Recovery:' $recovery_sequence | sed 's/^#.*Recovery: //')"
  . $recovery_sequence
else
  echo "No recovery file passed or unable to read recovery file."
fi

echo "$(date) - End"

View File

@ -0,0 +1,151 @@
# Raoul Scarazzini (rasca@redhat.com)
# This file provides the testing-suite helper functions to be used inside
# TripleO/Director OpenStack HA (Pacemaker) environments; it is sourced by
# ha-test-suite.sh.
# Print command line help (called on -h/-? or on wrong invocation).
function usage {
echo "Usage $0 -t <testfile> [-r <recover file>] [-u]
-t, --test <testfile> Specify which file contains the test to run
-r, --recover <recoverfile> Specify which file (if any) should be used for recovery
-u, --undercloud Test will be performed on undercloud
"
}
# Look for failed actions in the cluster.
# $1 (optional): restrict the check to a single resource.
# Returns 0 when nothing failed; otherwise prints the offending resources and
# exits 1. NOTE: exit, not return — when called from a sourced test file this
# aborts the whole suite run.
function check_failed_actions {
resource=$1
# Is a "Failed Actions:" section present in pcs status at all?
sudo pcs status | grep "Failed Actions:" &> /dev/null
if [ $? -eq 0 ]
then
if [ "x$resource" == "x" ]
then
echo "Cluster has failed actions:"
# Extract and deduplicate the failed resource names from the section.
sudo pcs status | sed -n -e '/Failed Actions:/,/^$/p' | egrep 'OCF_|not running|unknown' | awk '{print $2}' | cut -f1 -d_ | sort |uniq
exit 1
else
# The $(...) assignment propagates grep's exit status, checked just below:
# non-zero means this specific resource is not in the failed section.
errors=$(sudo pcs status | sed -n -e '/Failed Actions:/,/^$/p' | grep -A1 $resource)
if [ $? -eq 0 ]
then
echo "Resource $resource has failed actions:"
echo $errors
exit 1
else
echo "No failed actions for $resource."
return 0
fi
fi
else
# No failed section at all: report success for cluster or resource.
[ "x$resource" == "x" ] && echo "Cluster is OK." || echo "No failed actions for $resource."
return 0
fi
}
# Check, at the operating system level (not via pacemaker), that a process or
# address exists for each resource in $OVERCLOUD_RESOURCES, printing
# "active"/"inactive" for each one.
function check_resources_process_status {
for resource in $OVERCLOUD_RESOURCES
do
echo -n "$resource -> "
case $resource in
# VIP resources: check the address is configured on some interface.
ip-*) #ip_addr=$(pcs resource show $resource | grep Attributes | sed 's/.*ip=\(.*\) cidr.*/\1/g')
ip_addr=$(echo $resource | sed 's/ip-//g')
sudo ip a s | grep $ip_addr &> /dev/null
;;
rabbitmq) sudo /usr/sbin/rabbitmqctl cluster_status &> /dev/null
;;
redis) pidof /usr/bin/redis-server &> /dev/null
;;
galera) pidof /usr/libexec/mysqld &> /dev/null
;;
# Nothing to check for cleanup/delay helpers; the echo below succeeds,
# so these are always reported as "active".
*cleanup*|delay) echo -n "no need to check if it's "
;;
# Anything else is assumed to be a systemd-managed service.
*) systemctl is-active $resource &> /dev/null
;;
esac
# Status of the command chosen in the case above decides the verdict.
[ $? -eq 0 ] && echo "active" || echo "inactive"
done
}
# Poll the cluster until multi-state resource $1 reaches status $2 (e.g.
# "Started", "Masters"), checking once per second for up to
# $RESOURCE_CHANGE_STATUS_TIMEOUT attempts.
# Returns 0 on success; on timeout it prints the failed actions and exits 1
# (exit, not return: a sourced test file aborts the whole suite run).
function wait_resource_status {
  resource=$1
  status=$2
  i=1
  while [ $i -lt $RESOURCE_CHANGE_STATUS_TIMEOUT ]
  do
    # Extract the members of the resource's Clone/Master-Slave set and keep
    # only those NOT in the wanted status; empty output means we are done.
    output=$(sudo pcs status resources | sed -n -e "/\(Clone\|Master\/Slave\) Set: .*\[$resource\]/,/^ [a-Z]/p" | head -n -1 | tail -n +2 | egrep -v "$status\:")
    if [ "x$output" == "x" ]
    then
      # All members reached the wanted status (the original also had an
      # unreachable "break" after this return; removed).
      return 0
    else
      echo -n "."
      sleep 1
      let "i++"
    fi
  done
  # Timeout reached: report failed actions and abort.
  check_failed_actions
  exit 1
}
# Check that every member of resource $1's Clone/Master-Slave set is in
# status $2; returns 0 when the set matches.
# NOTE(review): on mismatch, "(check_failed_actions; exit 1)" runs in a
# subshell, so the exit only yields a non-zero return status for this
# function instead of terminating the caller — confirm this is intended.
function check_resource_status {
resource=$1
status=$2
output=$(sudo pcs status resources | sed -n -e "/\(Clone\|Master\/Slave\) Set: .*\[$resource\]/,/^ [a-Z]/p" | head -n -1 | tail -n +2 | egrep -v "$status\:")
# Since we are checking a specific status, if we have output from above it
# means that for some reason the resource is not in the state we are expecting
[ "x$output" == "x" ] && return 0 || (check_failed_actions; exit 1)
}
# Wait for the whole cluster to come up: poll pcs status once per second (up
# to $RESOURCE_CHANGE_STATUS_TIMEOUT attempts) until no resource is reported
# as "Stopped"; bail out early if any "Failed" line shows up.
# Returns 0 on success; otherwise prints the failed actions and exits 1.
function wait_cluster_start {
  i=1
  while true; do
    [ $i -eq $RESOURCE_CHANGE_STATUS_TIMEOUT ] && break
    # Check for failed actions
    sudo pcs status | egrep "Failed" &> /dev/null
    [ $? -eq 0 ] && break
    # If we have stopped resources let's wait
    sudo pcs status | egrep "Stopped" &> /dev/null
    if [ $? -eq 0 ]
    then
      echo -n "."
    else
      # Nothing stopped and nothing failed: success (the original also had an
      # unreachable "break" after this return; removed).
      echo "All cluster resources are started."
      return 0
    fi
    sleep 1
    let "i++"
  done
  # If we are here then we have problems: we hit timeout or we still have
  # stopped resources
  echo "Problems found. There are stopped or failed resources!"
  check_failed_actions
  exit 1
}
# Perform a pcs action ($1, e.g. "enable"/"disable") on every resource in the
# space-separated list $2, waiting up to $RESOURCE_CHANGE_STATUS_TIMEOUT for
# each one. On the first failure it prints that resource's failed actions and
# exits 1; returns 0 when every action succeeded.
function play_on_resources {
action=$1
resources=$2
for resource in $resources
do
echo -n "$(date) - Performing action $action on resource $resource "
# Do the action on the resource
sudo pcs resource $action $resource --wait $RESOURCE_CHANGE_STATUS_TIMEOUT
if [ $? -ne 0 ]
then
echo "FAILURE!"
check_failed_actions $resource
exit 1
else
echo "OK"
fi
done
return 0
}

View File

@ -0,0 +1,13 @@
# Recovery: Enable all systemd and core resources, cleanup failed actions
echo "$(date) * Step 1: enable all the cluster resources"
play_on_resources "enable" "$OVERCLOUD_RESOURCES"
echo "$(date) * Step 2: Cleaning up failed resources"
# Scrape pcs status for resources with OCF_TIMEOUT/"not running" failures and
# run "pcs resource cleanup" on each of them.
sudo pcs status | sed -n -e '/Failed Actions:/,/^$/p' | egrep 'OCF_TIMEOUT|not running' | awk '{print $2}' | cut -f1 -d_ | sort | uniq | while read RES; do echo "Cleaning $RES"; sudo pcs resource cleanup $RES; done
echo "$(date) * Step 3: Waiting all resources to start"
wait_cluster_start
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,27 @@
# Recovery: Instance creation
# Tears down everything created by the instance-creation test: floating IP,
# instance, image, flavor, router, security group, subnets and networks.
STACKDIR=/home/stack
INSTANCE_NAME=cirros-1
source $STACKDIR/overcloudrc
# Look up the instance's floating IP and the related neutron IDs.
floatingip=$(nova list | grep $INSTANCE_NAME | awk '{print $13}')
floatingip_id=$(neutron floatingip-list | grep $floatingip | awk '{print $2}')
port_id=$(neutron port-list | grep $floatingip | awk '{print $2}')
neutron floatingip-disassociate $floatingip_id $port_id
neutron floatingip-delete $floatingip_id
nova delete $INSTANCE_NAME
# Remove the CirrOS image, its local copy and the test flavor.
projectid=$(openstack project list | awk '/admin/ {print $2}')
glance --os-project-id=$projectid image-delete $(glance --os-project-id=$projectid image-list | grep CirrOS | awk '{print $2}')
rm /tmp/cirros-0.3.4-x86_64-disk.img
nova flavor-delete overcloud-instance-test-small-flavor
# Undo the router/network/security-group setup, leaf objects first.
neutron router-gateway-clear floating-router floating-network
neutron router-interface-delete floating-router private-subnet
neutron router-delete floating-router
neutron security-group-delete pingandssh
neutron subnet-delete private-subnet
neutron subnet-delete floating-subnet
neutron net-delete floating-network
neutron net-delete private-network

View File

@ -0,0 +1,7 @@
# Recovery: Enable keystone via httpd and check for failed actions
# Re-enables the httpd clone disabled by the matching test, then verifies no
# failed actions are left behind.
echo "$(date) * Step 1: enable keystone resource via httpd"
play_on_resources "enable" "httpd-clone"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,10 @@
# Recovery: Enable openstack-keystone and check for failed actions
# Re-enables the keystone clone, waits for the whole cluster to be started
# again, then verifies no failed actions are left behind.
echo "$(date) * Step 1: enable openstack-keystone resource"
play_on_resources "enable" "openstack-keystone-clone"
echo "$(date) - Checking for Stopped resources:"
wait_cluster_start
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,7 @@
# Recovery: Enable master slave resources (galera and redis), all the resources should come up
# $OVERCLOUD_CORE_RESOURCES is set by ha-test-suite.sh (galera, redis,
# rabbitmq); enabling them should bring every dependent resource back up.
echo "$(date) * Step 1: enable galera, redis and rabbitmq"
play_on_resources "enable" "$OVERCLOUD_CORE_RESOURCES"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,7 @@
# Recovery: Enable mongo and check for failed actions
# Re-enables the mongo resource disabled by the matching test.
echo "$(date) * Step 1: enable mongo"
play_on_resources "enable" "mongo"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,13 @@
# Recovery: Enable all systemd and core resources, cleanup failed actions
# First brings back the core resources, then the systemd-managed ones, and
# finally waits for the whole cluster to start.
echo "$(date) * Step 1: enable core resources"
play_on_resources "enable" "$OVERCLOUD_CORE_RESOURCES"
echo "$(date) * Step 2: enable all the systemd resources"
play_on_resources "enable" "$OVERCLOUD_SYSTEMD_RESOURCES"
echo "$(date) * Step 3: Waiting all resources to start"
wait_cluster_start
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,10 @@
# Recovery: Start cluster again
# Restarts pacemaker on all nodes and waits for every resource to come back.
echo "$(date) * Step 1: start the cluster"
sudo pcs cluster start --all
echo "$(date) * Step 2: Waiting all resources to start"
wait_cluster_start
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,3 @@
# Test: Wait cluster start and look for failed actions
# wait_cluster_start exits non-zero by itself when resources stay stopped or
# failed, so no explicit check is needed afterwards.
echo "$(date) - Waiting for cluster start and checking for failed resources:"
wait_cluster_start

View File

@ -0,0 +1,112 @@
# Test: Instance deployment
# Creates a full networking setup (router, floating and private networks),
# boots a CirrOS instance, attaches a floating IP and pings it until it
# answers. Sourced by ha-test-suite.sh, so "exit 1" aborts the suite.
set -e

STACKDIR=/home/stack
source $STACKDIR/overcloudrc
# Load the environment with all the info for floating network
source $STACKDIR/environment

TIMEOUT=30
INSTANCE_NAME=cirros-1
PRIVATE_NETWORK=10.1.1.0/24

# Gateway creation
# When the environment is recovering from a previous test it can happen
# that neutron is waiting to rejoin its cluster, preventing from creating
# new stuff like router. We wait at least 300 seconds before giving up.
set +e
RUN=1
while [ $RUN -lt $TIMEOUT ]
do
  neutron router-create floating-router
  if [ $? -eq 0 ]
  then
    break
  else
    echo "Waiting..."
    let "RUN++"
    sleep 10
  fi
done
# If timeout was reached then we need to exit with error
if [ $RUN -ge $TIMEOUT ]
then
  echo "It was not possible to create the router, giving up."
  exit 1
fi
set -e

# Network and subnet creation
neutron net-create floating-network --router:external=True --provider:physical_network $FLOATING_PHYSICAL_NET --provider:network_type flat
neutron subnet-create --name floating-subnet --disable-dhcp --allocation-pool start=$FLOATING_RANGE_START,end=$FLOATING_RANGE_END --gateway $FLOATING_GW floating-network $FLOATING_SUBNET
neutron net-create private-network
neutron subnet-create private-network $PRIVATE_NETWORK --name private-subnet

# Router configuration
neutron router-interface-add floating-router private-subnet
neutron router-gateway-set floating-router floating-network

# Security group allowing SSH and ICMP, plus a floating IP for the instance
neutron security-group-create pingandssh
securitygroup_id=$(neutron security-group-list | grep pingandssh | head -1 | awk '{print $2}')
neutron security-group-rule-create --direction ingress --protocol tcp --port-range-min 22 --port-range-max 22 $securitygroup_id
neutron security-group-rule-create --protocol icmp --direction ingress $securitygroup_id
floatingip=$(neutron floatingip-create floating-network | grep floating_ip_address | awk '{print $4}')
echo floatingip=$floatingip

#[stack@mrg-06 ~]$ neutron net-list
#...
#| 6fde7d2a-e2d9-4b0f-a982-b7cbc3244807 | private-network | 31a5ccd5-07bd-4103-a4a3-ab2c6d6148d7 10.1.1.0/24 |
#...

# Flavor, image and instance creation
nova flavor-create --ephemeral 0 --is-public True test.small overcloud-instance-test-small-flavor 2048 20 1
private_net_id=$(neutron net-list | grep private-network | awk '{print $2}')
wget -O /tmp/cirros-0.3.4-x86_64-disk.img http://download.cirros-cloud.net/0.3.4/cirros-0.3.4-x86_64-disk.img
projectid=$(openstack project list | awk '/admin/ {print $2}')
glance --os-project-id=$projectid image-create --name CirrOS --container-format bare --disk-format raw --file /tmp/cirros-0.3.4-x86_64-disk.img
nova boot --image CirrOS --flavor test.small --security-groups pingandssh --nic net-id=$private_net_id $INSTANCE_NAME

#[stack@mrg-06 ~]$ nova list
#...
#| eb29c1a1-c30e-4f8f-91ea-cec1fd38c088 | $INSTANCE_NAME | BUILD | spawning | NOSTATE | private-network=10.1.1.5 |
#...
sleep 5
instance_ip=$(nova list | grep $INSTANCE_NAME | awk '{print $12}' | sed "s/private-network=//g")
echo instance_ip=$instance_ip

#[stack@mrg-06 ~]$ neutron port-list
#...
#| 61ce215d-3dc7-4873-af73-342620cdc3b6 | | fa:16:3e:8d:8b:8d | {"subnet_id": "31a5ccd5-07bd-4103-a4a3-ab2c6d6148d7", "ip_address": "10.1.1.5"} |
#...
port_id=$(neutron port-list | grep $instance_ip | awk '{print $2}')
echo port_id=$port_id

#[stack@mrg-06 ~]$ neutron floatingip-list
#...
#| 624f5256-ee89-438f-8335-904017e74a18 | | 10.16.144.77 | |
#...
floatingip_id=$(neutron floatingip-list | grep $floatingip | awk '{print $2}')
echo floatingip_id=$floatingip_id
neutron floatingip-associate $floatingip_id $port_id

echo "------------------------------------------------------------"
echo "$(date) Instance will be available at the IP $floatingip"
echo "------------------------------------------------------------"

# Ping the floating IP until it answers or $TIMEOUT attempts are exhausted.
set +e
COUNTER=1
while [ $COUNTER -lt $TIMEOUT ]
do
  # Silence both stdout and stderr. (The original "2>&1 > /dev/null" dup'd
  # stderr to the terminal *before* redirecting stdout, leaking ping errors.)
  ping -c1 $floatingip > /dev/null 2>&1
  if [ $? -eq 0 ]
  then
    echo "SUCCESS"
    break
  else
    echo -n "."
  fi
  let COUNTER=COUNTER+1
done
# NOTE(review): the "exit 1" below runs in a subshell, so it only sets the
# status of this last command and cannot abort a sourcing caller — confirm.
[ $COUNTER -ge $TIMEOUT ] && (echo "FAILURE!"; exit 1)

View File

@ -0,0 +1,40 @@
# Test: Stop keystone resource (by stopping httpd), check no other resource is stopped
echo "$(date) * Step 1: disable keystone resource via httpd stop"
play_on_resources "disable" "httpd"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
echo "$(date) * Step 2: check resource status"
# Define resource list without httpd
OVERCLOUD_RESOURCES_NO_KEYSTONE="$(echo $OVERCLOUD_RESOURCES | sed 's/httpd/ /g')"
# Define number of minutes to look for status
MINUTES=10
# Cycling for $MINUTES minutes polling every minute the status of the resources
echo "$(date) - Cycling for 10 minutes polling every minute the status of the resources"
i=0
while [ $i -lt $MINUTES ]
do
# Wait a minute
sleep 60
echo "$(date) - Polling..."
for resource in $OVERCLOUD_RESOURCES_NO_KEYSTONE
do
echo -n "$resource -> "
# If the resource is a multi state like galera or redis, do a different check
case $resource in
"galera") check_resource_status "$resource" "Masters"
;;
"redis") check_resource_status "$resource" "(Masters|Slaves)"
;;
*) check_resource_status "$resource" "Started"
;;
esac
# NOTE(review): the ( ... ) below is a subshell, so FAILURES=1 is lost and
# "break" does not actually break this for loop — confirm intent.
[ $? -eq 0 ] && echo "OK" || (FAILURES=1; echo "Error!"; break)
done
let "i++"
done
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,7 @@
# Test: Stop openstack-keystone and look for failed actions
# Disabling the keystone clone must not leave failed actions behind;
# check_failed_actions exits 1 by itself if any are found.
echo "$(date) * Step 1: disable openstack-keystone resource"
play_on_resources "disable" "openstack-keystone-clone"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,7 @@
# Test: Stop master slave resources (galera and redis), all the resources should come down
# $OVERCLOUD_CORE_RESOURCES is set by ha-test-suite.sh (galera, redis,
# rabbitmq).
echo "$(date) * Step 1: disable galera, redis and rabbitmq"
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,43 @@
# Test: Stop mongo resource, check related systemd resources are fine
echo "$(date) * Step 1: disable mongo"
play_on_resources "disable" "mongo"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
echo "$(date) * Step 2: check resource status"
# Define related resources
# NOTE(review): this overwrites the global $OVERCLOUD_RESOURCES for the rest
# of the suite run (this file is sourced) — confirm that is acceptable.
OVERCLOUD_RESOURCES="openstack-aodh-evaluator openstack-aodh-listener openstack-aodh-notifier openstack-ceilometer-central.service openstack-ceilometer-collector.service openstack-ceilometer-notification.service"
# Define number of minutes to look for status
MINUTES=10
# Cycling for $MINUTES minutes polling every minute the status of the resources
echo "$(date) - Cycling for 10 minutes polling every minute the status of the resources"
i=0
while [ $i -lt $MINUTES ]
do
# Wait a minute
sleep 60
echo "$(date) - Polling..."
for resource in $OVERCLOUD_RESOURCES
do
echo -n "$resource -> "
# Check if the resource is active for the system
systemctl is-active $resource
if [ $? -ne 0 ]
then
# Show status of the resource
echo "Error! Resource $resource is not active anymore."
systemctl status $resource
# Check in any case cluster's failed actions
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
# Now exit with an error
exit 1
fi
done
let "i++"
done
# If we are here, test was successful
echo "$(date) - Test was successful"

View File

@ -0,0 +1,19 @@
# Test: Stop every systemd resource, stop Galera and Rabbitmq, Start every systemd resource
# $OVERCLOUD_SYSTEMD_RESOURCES / $OVERCLOUD_CORE_RESOURCES are set by
# ha-test-suite.sh.
echo "$(date) * Step 1: disable all the systemd resources"
play_on_resources "disable" "$OVERCLOUD_SYSTEMD_RESOURCES"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
echo "$(date) * Step 2: disable core services"
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
echo "$(date) * Step 3: enable each resource one by one and check the status"
# NOTE(review): play_on_resources iterates the list one resource at a time;
# the core resources disabled in Step 2 are not re-enabled here.
play_on_resources "enable" "$OVERCLOUD_SYSTEMD_RESOURCES"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,19 @@
# Test: Stop Galera and Rabbitmq, stop every systemd resource, Start every systemd resource
# Disables core services first, then the systemd resources, then re-enables
# only the systemd resources (recovery handles the core ones).
echo "$(date) * Step 1: disable core services"
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
echo "$(date) * Step 2: disable all the systemd resources"
play_on_resources "disable" "$OVERCLOUD_SYSTEMD_RESOURCES"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
echo "$(date) * Step 3: enable all the systemd resources"
play_on_resources "enable" "$OVERCLOUD_SYSTEMD_RESOURCES"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,22 @@
# Test: Stop Galera and Rabbitmq, wait 20 minutes to see if something fails
echo "$(date) * Step 1: disable core services"
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
echo "$(date) - List of cluster's failed actions:"
check_failed_actions
echo "$(date) * Step 2: poll every minute for twenty minutes the status of the resources"
for i in $(seq 1 20)
do
check_failed_actions
# NOTE(review): check_failed_actions exits 1 by itself when failures are
# found, so this branch is only reached on success — confirm intent.
if [ $? -ne 0 ]
then
echo "Errors found, test is over."
break
fi
sleep 60
done
echo "$(date) - List of cluster's failed actions:"
check_failed_actions

View File

@ -0,0 +1,10 @@
# Test: Check active processes after cluster stop
# Prints OS-level process status before and after stopping the whole cluster;
# output comparison is left to the reader (no explicit assertions here).
echo "$(date) * Step 1: checking actual process status"
check_resources_process_status
echo "$(date) * Step 2: stopping cluster"
sudo pcs cluster stop --all
echo "$(date) * Step 3: checking actual process status"
check_resources_process_status