Integrate ha-test-suite into the repo
This commit puts the ha-test-suite under the tools directory instead of having it downloaded from the outside (originally it was available at the link https://github.com/rscarazz/tripleo-director-ha-test-suite). At code level the suite is now copied directly from the local path tools/ha-test-suite and the executable is now ha-test-suite.sh. Change-Id: I087bc28a0afa3ede9b2fb698892b8306f56790a2
This commit is contained in:
parent
98593f6628
commit
d9e9613a8b
@ -17,14 +17,13 @@
|
||||
"{{ working_dir }}/workarounds.sh"
|
||||
when: apply_workarounds
|
||||
|
||||
- name: Get overcloud-ha-test-suite on undercloud and controllers
|
||||
- name: Copy ha-test-suite on undercloud and controllers
|
||||
shell: >
|
||||
rm -rf tripleo-director-ha-test-suite;
|
||||
git clone https://github.com/rscarazz/tripleo-director-ha-test-suite/ tripleo-director-ha-test-suite;
|
||||
delegate_to: "{{ item }}"
|
||||
/usr/bin/rsync --delay-updates -F --compress --archive -e 'ssh -F {{ local_working_dir }}/ssh.config.ansible' {{ local_working_dir }}/tripleo-quickstart-utils/tools/ha-test-suite {{ item }}:
|
||||
delegate_to: "localhost"
|
||||
with_items:
|
||||
- "undercloud"
|
||||
- "{{ groups['controller'] }}"
|
||||
- "undercloud"
|
||||
|
||||
- name: Include test sequence depending on release
|
||||
include_vars:
|
||||
@ -39,7 +38,7 @@
|
||||
- name: HA test - Failed actions (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_check-failed-actions
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_check-failed-actions
|
||||
register: test_ha_failed_actions_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -54,7 +53,7 @@
|
||||
- name: HA test - Master/Slave core resource stop and start (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_master-slave -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_master-slave
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_master-slave -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_master-slave
|
||||
register: test_ha_master_slave_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -69,7 +68,7 @@
|
||||
- name: HA test Keystone stop (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_keystone-constraint-removal
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_keystone-constraint-removal
|
||||
register: test_ha_keystone_stop_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -84,7 +83,7 @@
|
||||
- name: HA test Keystone removal (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_keystone-constraint-removal
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_keystone-constraint-removal
|
||||
register: test_ha_keystone_constraint_removal_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -99,7 +98,7 @@
|
||||
- name: HA test NG A (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_pacemaker-light-a -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_pacemaker-light
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_pacemaker-light-a -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_pacemaker-light
|
||||
register: test_ha_ng_a_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -114,7 +113,7 @@
|
||||
- name: HA test NG B (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_pacemaker-light-b -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_pacemaker-light
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_pacemaker-light-b -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_pacemaker-light
|
||||
register: test_ha_ng_b_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -129,7 +128,7 @@
|
||||
- name: HA test NG C (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_pacemaker-light-c -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_pacemaker-light
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_pacemaker-light-c -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_pacemaker-light
|
||||
register: test_ha_ng_c_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -143,7 +142,7 @@
|
||||
- block:
|
||||
- name: HA Test instance deploy on the overcloud (undercloud)
|
||||
shell: >
|
||||
{{ working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ working_dir }}/tripleo-director-ha-test-suite/test/test_instance-creation -r {{ working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_instance-creation -u
|
||||
{{ working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ working_dir }}/ha-test-suite/test/test_instance-creation -r {{ working_dir }}/ha-test-suite/recovery/recovery_instance-creation -u
|
||||
register: test_ha_instance_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
|
145
tools/ha-test-suite/README.md
Normal file
145
tools/ha-test-suite/README.md
Normal file
@ -0,0 +1,145 @@
|
||||
# OpenStack TripleO HA Test Suite
|
||||
|
||||
This project is a modular and a customizable test suite to be applied in an
|
||||
Overcloud OpenStack environment deployed via TripleO upstream or Red Hat
|
||||
OpenStack Director (OSPd).
|
||||
|
||||
## Usage
|
||||
|
||||
The script needs at least a test file (-t) which must contain the sequence of
|
||||
the operations to be done. A recovery file (-r), with the sequence of the
|
||||
operations needed to recovery the environment can also be passed. So a typical
|
||||
invocation will be something like this:
|
||||
|
||||
```console
|
||||
[heat-admin@overcloud-controller-0 ha-test-suite]$ ./ha-test-suite.sh -t test/test_keystone-constraint-removal -r recovery/recovery_keystone-constraint-removal
|
||||
Fri May 20 15:27:19 UTC 2016 - Populating overcloud elements...OK
|
||||
Fri May 20 15:27:22 UTC 2016 - Test: Stop keystone resource (by stopping httpd), check no other resource is stopped
|
||||
Fri May 20 15:27:22 UTC 2016 * Step 1: disable keystone resource via httpd stop
|
||||
Fri May 20 15:27:22 UTC 2016 - Performing action disable on resource httpd ..OK
|
||||
Fri May 20 15:27:26 UTC 2016 - List of cluster's failed actions:
|
||||
Cluster is OK.
|
||||
Fri May 20 15:27:29 UTC 2016 * Step 2: check resource status
|
||||
Fri May 20 15:27:29 UTC 2016 - Cycling for 10 minutes polling every minute the status of the resources
|
||||
Fri May 20 15:28:29 UTC 2016 - Polling...
|
||||
delay -> OK
|
||||
galera -> OK
|
||||
...
|
||||
...
|
||||
openstack-sahara-engine -> OK
|
||||
rabbitmq -> OK
|
||||
redis -> OK
|
||||
Fri May 20 15:41:00 UTC 2016 - List of cluster's failed actions:
|
||||
Cluster is OK.
|
||||
Fri May 20 15:41:03 UTC 2016 - Waiting 10 seconds to recover environment
|
||||
Fri May 20 15:41:13 UTC 2016 - Recovery: Enable keystone via httpd and check for failed actions
|
||||
Fri May 20 15:41:13 UTC 2016 * Step 1: enable keystone resource via httpd
|
||||
Fri May 20 15:41:13 UTC 2016 - Performing action enable on resource httpd-clone OK
|
||||
Fri May 20 15:41:15 UTC 2016 - List of cluster's failed actions:
|
||||
Cluster is OK.
|
||||
Fri May 20 15:41:17 UTC 2016 - End
|
||||
```
|
||||
|
||||
The exit status will depend on the result of the operations. If a disable
|
||||
operation fails, if failed actions appear, or if recovery does not end with
|
||||
success, the exit status will not be 0.
|
||||
|
||||
## Test and recoveries
|
||||
|
||||
Test and recovery are bash script portions that are
|
||||
included inside the main script. Some functions and variables are available to
|
||||
help on recurring operations. These functions are listed here:
|
||||
|
||||
- **check_failed_actions**: will print failed actions and return error in case
|
||||
some of them are present;
|
||||
- **check_resources_process_status**: will check for the process status of the
|
||||
resources on the system (not in the cluster), i.e. will check if there is a
|
||||
process for mysql daemon;
|
||||
- **wait_resource_status**: will wait until a default timeout
|
||||
($RESOURCE_CHANGE_STATUS_TIMEOUT) for a resource to reach a status;
|
||||
- **check_resource_status**: will check a resource status, i.e. if you want to
|
||||
check if httpd resource is started;
|
||||
- **wait_cluster_start**: will wait until a timeout
|
||||
($RESOURCE_CHANGE_STATUS_TIMEOUT) for the cluster to be started; specifically it will wait for
|
||||
all resources to be in state "Started";
|
||||
- **play_on_resources**: will set the status of a resource;
|
||||
|
||||
The variables are:
|
||||
|
||||
- **OVERCLOUD_CORE_RESOURCES**: which are galera and rabbitmq
|
||||
- **OVERCLOUD_RESOURCES**: which are *all* the resources
|
||||
- **OVERCLOUD_SYSTEMD_RESOURCES**: which are the resources managed via systemd
|
||||
by pacemaker;
|
||||
|
||||
These can be used in combination to write test and recovery files.
|
||||
|
||||
### Test file contents
|
||||
|
||||
A typical test file, say test/test_keystone-constraint-removal, will contain
|
||||
something like this:
|
||||
|
||||
```bash
|
||||
# Test: Stop keystone resource (by stopping httpd), check no other resource is stopped
|
||||
|
||||
echo "$(date) * Step 1: disable keystone resource via httpd stop"
|
||||
play_on_resources "disable" "httpd"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: check resource status"
|
||||
# Define resource list without httpd
|
||||
OVERCLOUD_RESOURCES_NO_KEYSTONE="$(echo $OVERCLOUD_RESOURCES | sed 's/httpd/ /g')"
|
||||
# Define number of minutes to look for status
|
||||
MINUTES=10
|
||||
# Cycling for $MINUTES minutes polling every minute the status of the resources
|
||||
echo "$(date) - Cycling for 10 minutes polling every minute the status of the resources"
|
||||
i=0
|
||||
while [ $i -lt $MINUTES ]
|
||||
do
|
||||
# Wait a minute
|
||||
sleep 60
|
||||
echo "$(date) - Polling..."
|
||||
for resource in $OVERCLOUD_RESOURCES_NO_KEYSTONE
|
||||
do
|
||||
echo -n "$resource -> "
|
||||
check_resource_status "$resource" "Started"
|
||||
[ $? -eq 0 ] && echo "OK" || (FAILURES=1; echo "Error!")
|
||||
done
|
||||
let "i++"
|
||||
done
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
```
|
||||
|
||||
Code is commented and should be self explaining, but in short:
|
||||
- the first commented line, after "# Test: " is read as test title;
|
||||
- using play_on_resources it disables httpd resource;
|
||||
- it checks for failed actions;
|
||||
- it defines a list of variable named OVERCLOUD_RESOURCES_NO_KEYSTONE containing
|
||||
all the variable but httpd;
|
||||
- it cycles for 10 minutes, polling every minute the status of all the
|
||||
resources;
|
||||
|
||||
If any of these steps for some reason fails, then the overall test will be
|
||||
considered failed and the exit status will not be 0.
|
||||
|
||||
### Recovery file contents
|
||||
|
||||
A typical recovery file, say recovery/recovery_keystone-constraint-removal,
|
||||
will contain something like this:
|
||||
|
||||
```bash
|
||||
# Recovery: Enable keystone via httpd and check for failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable keystone resource via httpd"
|
||||
play_on_resources "enable" "httpd-clone"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:" check_failed_actions
|
||||
```
|
||||
|
||||
Again:
|
||||
- the first commented line, after "# Recovery: " is read as recovery title;
|
||||
- using play_on_resources it enables httpd resource;
|
||||
- it checks for failed actions;
|
87
tools/ha-test-suite/ha-test-suite.sh
Executable file
87
tools/ha-test-suite/ha-test-suite.sh
Executable file
@ -0,0 +1,87 @@
|
||||
#!/bin/bash

# Raoul Scarazzini (rasca@redhat.com)
# This script provides a testing suite for TripleO/Director OpenStack HA (so with Pacemaker) environments.
# It sources a test file (-t) and, optionally, a recovery file (-r); exit
# status is nonzero when a sourced step (or a helper function) aborts.

# Define main workdir (directory containing this script)
WORKDIR=$(dirname $0)

# Source function library (usage, check_*, wait_*, play_on_resources).
. $WORKDIR/include/functions

# Fixed parameters
# How much time to wait, in seconds, for a resource to change status (i.e. from started to stopped)
RESOURCE_CHANGE_STATUS_TIMEOUT=600
# How much time to wait, in seconds, before starting recovery
DEFAULT_RECOVERY_WAIT_TIME=10

# Command line parameters: -t/--test and -r/--recover take a value (hence the
# extra shift inside their case arm), -u/--undercloud is a flag.
if [ $# -gt 0 ]
then
  while :; do
    case $1 in
      -h|-\?|--help)
        usage
        exit
        ;;
      -u|--undercloud)
        undercloud=true
        shift
        ;;
      -t|--test)
        test_sequence="$2"
        shift
        ;;
      -r|--recover)
        recovery_sequence="$2"
        shift
        ;;
      --)
        shift
        break
        ;;
      -?*)
        usage
        exit 1
        ;;
      *)
        break
    esac

    shift
  done
else
  usage
  exit 1
fi

# Populating overcloud elements if not on undercloud.
# These variables are consumed by the sourced test/recovery files.
if [ "$undercloud" != true ]
then
  # FIX: message typo ("Populationg" -> "Populating")
  echo -n "$(date) - Populating overcloud elements..."
  OVERCLOUD_CORE_RESOURCES="galera redis rabbitmq"
  OVERCLOUD_RESOURCES=$(sudo pcs resource show | egrep '^ (C|[a-Z])' | sed 's/.* \[\(.*\)\]/\1/g' | sed 's/ \(.*\)(.*):.*/\1/g' | sort)
  OVERCLOUD_SYSTEMD_RESOURCES=$(sudo pcs config show | egrep "Resource:.*systemd"|grep -v "haproxy"|awk '{print $2}')
  echo "OK"
fi

# Run the test: its title is the first "# Test:" comment line in the file.
if [ -f "$test_sequence" ]
then
  echo "$(date) - Test: $(grep '^#.*Test:' $test_sequence | sed 's/^#.*Test: //')"
  . $test_sequence
else
  echo "No test file passed or unable to read test file."
fi

# Run the recovery (after a grace period), title taken from "# Recovery:".
if [ -f "$recovery_sequence" ]
then
  echo "$(date) - Waiting $DEFAULT_RECOVERY_WAIT_TIME seconds to recover environment"
  sleep $DEFAULT_RECOVERY_WAIT_TIME

  echo "$(date) - Recovery: $(grep '^#.*Recovery:' $recovery_sequence | sed 's/^#.*Recovery: //')"
  . $recovery_sequence
else
  echo "No recovery file passed or unable to read recovery file."
fi

echo "$(date) - End"
|
151
tools/ha-test-suite/include/functions
Executable file
151
tools/ha-test-suite/include/functions
Executable file
@ -0,0 +1,151 @@
|
||||
# Raoul Scarazzini (rasca@redhat.com)
|
||||
# This script provides a testing suite for TripleO/Director OpenStack HA
# (Pacemaker-managed) environments: functions to be used inside
# TripleO/Director OpenStack HA environments.
|
||||
|
||||
# Print the command-line help for the test suite.
function usage {
  cat <<EOF
Usage $0 -t <testfile> [-r <recover file>] [-u]
-t, --test <testfile> Specify which file contains the test to run
-r, --recover <recoverfile> Specify which file (if any) should be used for recovery
-u, --undercloud Test will be performed on undercloud

EOF
}
|
||||
|
||||
# Report the cluster's failed actions, either globally (no argument) or for a
# single resource ($1). Exits 1 when failed actions are found; returns 0
# otherwise.
function check_failed_actions {
  target=$1

  # No "Failed Actions:" section in pcs status at all: nothing is wrong.
  if ! sudo pcs status | grep "Failed Actions:" &> /dev/null
  then
    if [ "x$target" == "x" ]
    then
      echo "Cluster is OK."
    else
      echo "No failed actions for $target."
    fi
    return 0
  fi

  # Global check: list the resources that appear in the failed-actions section.
  if [ "x$target" == "x" ]
  then
    echo "Cluster has failed actions:"
    sudo pcs status | sed -n -e '/Failed Actions:/,/^$/p' | egrep 'OCF_|not running|unknown' | awk '{print $2}' | cut -f1 -d_ | sort |uniq
    exit 1
  fi

  # Per-resource check: grep the failed-actions section for the resource.
  # $? below is the grep's status, propagated through the substitution.
  errors=$(sudo pcs status | sed -n -e '/Failed Actions:/,/^$/p' | grep -A1 $target)
  if [ $? -eq 0 ]
  then
    echo "Resource $target has failed actions:"
    echo $errors
    exit 1
  fi

  echo "No failed actions for $target."
  return 0
}
|
||||
|
||||
# For every resource in $OVERCLOUD_RESOURCES, probe its backing process on the
# local system (not the cluster view) and print "<resource> -> active|inactive".
function check_resources_process_status {
  for res in $OVERCLOUD_RESOURCES
  do
    echo -n "$res -> "

    case $res in
      ip-*) # VIP resources encode the address in their name: ip-<address>
        #ip_addr=$(pcs resource show $res | grep Attributes | sed 's/.*ip=\(.*\) cidr.*/\1/g')
        ip_addr=$(echo $res | sed 's/ip-//g')
        sudo ip a s | grep $ip_addr &> /dev/null
        ;;
      rabbitmq)
        sudo /usr/sbin/rabbitmqctl cluster_status &> /dev/null
        ;;
      redis)
        pidof /usr/bin/redis-server &> /dev/null
        ;;
      galera)
        pidof /usr/libexec/mysqld &> /dev/null
        ;;
      *cleanup*|delay)
        # Nothing to probe; the echo's 0 status yields "active" below.
        echo -n "no need to check if it's "
        ;;
      *)
        systemctl is-active $res &> /dev/null
        ;;
    esac
    # $? is the status of the probe executed in the case arm above.
    if [ $? -eq 0 ]
    then
      echo "active"
    else
      echo "inactive"
    fi
  done
}
|
||||
|
||||
# Poll (up to $RESOURCE_CHANGE_STATUS_TIMEOUT seconds, once per second) until
# every instance of a clone/master-slave resource ($1) reaches status $2.
# On timeout: print failed actions and abort the whole run (exit 1).
function wait_resource_status {
  res=$1
  wanted=$2
  attempt=1

  while [ $attempt -lt $RESOURCE_CHANGE_STATUS_TIMEOUT ]
  do
    # Lines of the resource's status block NOT yet in the wanted state.
    pending=$(sudo pcs status resources | sed -n -e "/\(Clone\|Master\/Slave\) Set: .*\[$res\]/,/^ [a-Z]/p" | head -n -1 | tail -n +2 | egrep -v "$wanted\:")
    [ "x$pending" == "x" ] && return 0

    echo -n "."
    sleep 1
    let "attempt++"
  done

  # Timed out: report and stop.
  check_failed_actions
  exit 1
}
|
||||
|
||||
# One-shot check that every instance of a clone/master-slave resource ($1) is
# in status $2. Returns 0 on success; on mismatch prints failed actions and
# yields a nonzero status.
function check_resource_status {
  res=$1
  wanted=$2

  pending=$(sudo pcs status resources | sed -n -e "/\(Clone\|Master\/Slave\) Set: .*\[$res\]/,/^ [a-Z]/p" | head -n -1 | tail -n +2 | egrep -v "$wanted\:")

  # Since we are checking a specific status, any remaining output means the
  # resource is not in the state we expect. NOTE(review): the subshell is kept
  # on purpose — 'exit 1' only leaves the subshell, so callers receive status 1
  # rather than the whole script aborting (matches original behavior).
  [ "x$pending" == "x" ] && return 0 || (check_failed_actions; exit 1)
}
|
||||
|
||||
# Wait (up to $RESOURCE_CHANGE_STATUS_TIMEOUT seconds) for all cluster
# resources to leave the "Stopped" state. Returns 0 when everything is
# started; on failed resources or timeout, prints diagnostics and exits 1.
function wait_cluster_start {
  tick=1

  while [ $tick -ne $RESOURCE_CHANGE_STATUS_TIMEOUT ]
  do
    # Any failed resource ends the wait immediately.
    if sudo pcs status | egrep "Failed" &> /dev/null
    then
      break
    fi

    # Still-stopped resources: keep waiting; none left: success.
    if sudo pcs status | egrep "Stopped" &> /dev/null
    then
      echo -n "."
    else
      echo "All cluster resources are started."
      return 0
    fi

    sleep 1
    let "tick++"
  done

  # If we are here then we have problems: we hit the timeout or we still have
  # stopped/failed resources.
  echo "Problems found. There are stopped or failed resources!"
  check_failed_actions
  exit 1
}
|
||||
|
||||
# Apply a pcs action ($1, e.g. enable/disable) to each resource in the
# space-separated list $2, waiting up to $RESOURCE_CHANGE_STATUS_TIMEOUT for
# each. On any failure: print that resource's failed actions and exit 1.
function play_on_resources {
  action=$1
  resources=$2

  for resource in $resources
  do
    echo -n "$(date) - Performing action $action on resource $resource "
    # Do the action on the resource.
    if sudo pcs resource $action $resource --wait $RESOURCE_CHANGE_STATUS_TIMEOUT
    then
      echo "OK"
    else
      echo "FAILURE!"
      check_failed_actions $resource
      exit 1
    fi
  done

  return 0
}
|
13
tools/ha-test-suite/recovery/recovery_entire-cluster
Normal file
13
tools/ha-test-suite/recovery/recovery_entire-cluster
Normal file
@ -0,0 +1,13 @@
|
||||
# Recovery: Enable all systemd and core resources, cleanup failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable all the cluster resources"
|
||||
play_on_resources "enable" "$OVERCLOUD_RESOURCES"
|
||||
|
||||
echo "$(date) * Step 2: Cleaning up failed resources"
|
||||
sudo pcs status | sed -n -e '/Failed Actions:/,/^$/p' | egrep 'OCF_TIMEOUT|not running' | awk '{print $2}' | cut -f1 -d_ | sort | uniq | while read RES; do echo "Cleaning $RES"; sudo pcs resource cleanup $RES; done
|
||||
|
||||
echo "$(date) * Step 3: Waiting all resources to start"
|
||||
wait_cluster_start
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
27
tools/ha-test-suite/recovery/recovery_instance-creation
Normal file
27
tools/ha-test-suite/recovery/recovery_instance-creation
Normal file
@ -0,0 +1,27 @@
|
||||
# Recovery: Instance creation
|
||||
|
||||
STACKDIR=/home/stack
|
||||
|
||||
INSTANCE_NAME=cirros-1
|
||||
|
||||
source $STACKDIR/overcloudrc
|
||||
|
||||
floatingip=$(nova list | grep $INSTANCE_NAME | awk '{print $13}')
|
||||
floatingip_id=$(neutron floatingip-list | grep $floatingip | awk '{print $2}')
|
||||
port_id=$(neutron port-list | grep $floatingip | awk '{print $2}')
|
||||
|
||||
neutron floatingip-disassociate $floatingip_id $port_id
|
||||
neutron floatingip-delete $floatingip_id
|
||||
nova delete $INSTANCE_NAME
|
||||
projectid=$(openstack project list | awk '/admin/ {print $2}')
|
||||
glance --os-project-id=$projectid image-delete $(glance --os-project-id=$projectid image-list | grep CirrOS | awk '{print $2}')
|
||||
rm /tmp/cirros-0.3.4-x86_64-disk.img
|
||||
nova flavor-delete overcloud-instance-test-small-flavor
|
||||
neutron router-gateway-clear floating-router floating-network
|
||||
neutron router-interface-delete floating-router private-subnet
|
||||
neutron router-delete floating-router
|
||||
neutron security-group-delete pingandssh
|
||||
neutron subnet-delete private-subnet
|
||||
neutron subnet-delete floating-subnet
|
||||
neutron net-delete floating-network
|
||||
neutron net-delete private-network
|
@ -0,0 +1,7 @@
|
||||
# Recovery: Enable keystone via httpd and check for failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable keystone resource via httpd"
|
||||
play_on_resources "enable" "httpd-clone"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
10
tools/ha-test-suite/recovery/recovery_keystone-stop
Normal file
10
tools/ha-test-suite/recovery/recovery_keystone-stop
Normal file
@ -0,0 +1,10 @@
|
||||
# Recovery: Enable openstack-keystone and check for failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable openstack-keystone resource"
|
||||
play_on_resources "enable" "openstack-keystone-clone"
|
||||
|
||||
echo "$(date) - Checking for Stopped resources:"
|
||||
wait_cluster_start
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
7
tools/ha-test-suite/recovery/recovery_master-slave
Normal file
7
tools/ha-test-suite/recovery/recovery_master-slave
Normal file
@ -0,0 +1,7 @@
|
||||
# Recovery: Enable master slave resources (galera and redis), all the resources should come up
|
||||
|
||||
echo "$(date) * Step 1: enable galera, redis and rabbitmq"
|
||||
play_on_resources "enable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
7
tools/ha-test-suite/recovery/recovery_mongo
Normal file
7
tools/ha-test-suite/recovery/recovery_mongo
Normal file
@ -0,0 +1,7 @@
|
||||
# Recovery: Enable mongo and check for failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable mongo"
|
||||
play_on_resources "enable" "mongo"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
13
tools/ha-test-suite/recovery/recovery_pacemaker-light
Normal file
13
tools/ha-test-suite/recovery/recovery_pacemaker-light
Normal file
@ -0,0 +1,13 @@
|
||||
# Recovery: Enable all systemd and core resources, cleanup failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable core resources"
|
||||
play_on_resources "enable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) * Step 2: enable all the systemd resources"
|
||||
play_on_resources "enable" "$OVERCLOUD_SYSTEMD_RESOURCES"
|
||||
|
||||
echo "$(date) * Step 3: Waiting all resources to start"
|
||||
wait_cluster_start
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
@ -0,0 +1,10 @@
|
||||
# Recovery: Start cluster again
|
||||
|
||||
echo "$(date) * Step 1: start the cluster"
|
||||
sudo pcs cluster start --all
|
||||
|
||||
echo "$(date) * Step 2: Waiting all resources to start"
|
||||
wait_cluster_start
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
3
tools/ha-test-suite/test/test_check-failed-actions
Normal file
3
tools/ha-test-suite/test/test_check-failed-actions
Normal file
@ -0,0 +1,3 @@
|
||||
# Test: Wait cluster start and look for failed actions
|
||||
echo "$(date) - Waiting for cluster start and checking for failed resources:"
|
||||
wait_cluster_start
|
112
tools/ha-test-suite/test/test_instance-creation
Normal file
112
tools/ha-test-suite/test/test_instance-creation
Normal file
@ -0,0 +1,112 @@
|
||||
# Test: Instance deployment
#
# Deploys a CirrOS instance on the overcloud (network, subnet, router,
# security group, flavor, image, boot), associates a floating IP and pings it.
# Sourced by ha-test-suite.sh with -u (undercloud), so 'exit 1' aborts the run.

set -e

STACKDIR=/home/stack

source $STACKDIR/overcloudrc

# Load the environment with all the info for floating network
source $STACKDIR/environment

TIMEOUT=30
INSTANCE_NAME=cirros-1
PRIVATE_NETWORK=10.1.1.0/24

# Gateway creation
# When the environment is recovering from a previous test it can happen
# that neutron is waiting to rejoin its cluster, preventing from creating
# new stuff like router. We wait at least 300 seconds before giving up.
set +e
RUN=1
while [ $RUN -lt $TIMEOUT ]
do
  neutron router-create floating-router
  if [ $? -eq 0 ]
  then
    break
  else
    echo "Waiting..."
    let "RUN++"
    sleep 10
  fi
done
# If timeout was reached then we need to exit with error
if [ $RUN -ge $TIMEOUT ]
then
  echo "It was not possible to create the router, giving up."
  exit 1
fi
set -e

# Network and subnet creation
neutron net-create floating-network --router:external=True --provider:physical_network $FLOATING_PHYSICAL_NET --provider:network_type flat
neutron subnet-create --name floating-subnet --disable-dhcp --allocation-pool start=$FLOATING_RANGE_START,end=$FLOATING_RANGE_END --gateway $FLOATING_GW floating-network $FLOATING_SUBNET
neutron net-create private-network
neutron subnet-create private-network $PRIVATE_NETWORK --name private-subnet
# Router configuration
neutron router-interface-add floating-router private-subnet
neutron router-gateway-set floating-router floating-network
neutron security-group-create pingandssh
securitygroup_id=$(neutron security-group-list | grep pingandssh | head -1 | awk '{print $2}')
neutron security-group-rule-create --direction ingress --protocol tcp --port-range-min 22 --port-range-max 22 $securitygroup_id
neutron security-group-rule-create --protocol icmp --direction ingress $securitygroup_id
floatingip=$(neutron floatingip-create floating-network | grep floating_ip_address | awk '{print $4}')
echo floatingip=$floatingip

#[stack@mrg-06 ~]$ neutron net-list
#...
#| 6fde7d2a-e2d9-4b0f-a982-b7cbc3244807 | private-network | 31a5ccd5-07bd-4103-a4a3-ab2c6d6148d7 10.1.1.0/24 |
#...
nova flavor-create --ephemeral 0 --is-public True test.small overcloud-instance-test-small-flavor 2048 20 1
private_net_id=$(neutron net-list | grep private-network | awk '{print $2}')
wget -O /tmp/cirros-0.3.4-x86_64-disk.img http://download.cirros-cloud.net/0.3.4/cirros-0.3.4-x86_64-disk.img
projectid=$(openstack project list | awk '/admin/ {print $2}')
glance --os-project-id=$projectid image-create --name CirrOS --container-format bare --disk-format raw --file /tmp/cirros-0.3.4-x86_64-disk.img
nova boot --image CirrOS --flavor test.small --security-groups pingandssh --nic net-id=$private_net_id $INSTANCE_NAME

#[stack@mrg-06 ~]$ nova list
#...
#| eb29c1a1-c30e-4f8f-91ea-cec1fd38c088 | $INSTANCE_NAME | BUILD | spawning | NOSTATE | private-network=10.1.1.5 |
#...
sleep 5
instance_ip=$(nova list | grep $INSTANCE_NAME | awk '{print $12}' | sed "s/private-network=//g")
echo instance_ip=$instance_ip

#[stack@mrg-06 ~]$ neutron port-list
#...
#| 61ce215d-3dc7-4873-af73-342620cdc3b6 | | fa:16:3e:8d:8b:8d | {"subnet_id": "31a5ccd5-07bd-4103-a4a3-ab2c6d6148d7", "ip_address": "10.1.1.5"} |
#...
port_id=$(neutron port-list | grep $instance_ip | awk '{print $2}')
echo port_id=$port_id

#[stack@mrg-06 ~]$ neutron floatingip-list
#...
#| 624f5256-ee89-438f-8335-904017e74a18 | | 10.16.144.77 | |
#...
floatingip_id=$(neutron floatingip-list | grep $floatingip | awk '{print $2}')
echo floatingip_id=$floatingip_id
neutron floatingip-associate $floatingip_id $port_id

echo "------------------------------------------------------------"
echo "$(date) Instance will be available at the IP $floatingip"
echo "------------------------------------------------------------"

set +e

COUNTER=1
while [ $COUNTER -lt $TIMEOUT ]
do
  # FIX: redirect order — '2>&1 > /dev/null' sent stderr to the terminal;
  # '> /dev/null 2>&1' silences both streams as intended.
  ping -c1 $floatingip > /dev/null 2>&1

  if [ $? -eq 0 ]
  then
    echo "SUCCESS"
    break
  else
    echo -n "."
  fi
  let COUNTER=COUNTER+1
done

# FIX: brace group instead of subshell — the original '(echo "FAILURE!"; exit 1)'
# only exited the subshell, so a ping timeout never failed the test run.
[ $COUNTER -ge $TIMEOUT ] && { echo "FAILURE!"; exit 1; }
|
40
tools/ha-test-suite/test/test_keystone-constraint-removal
Normal file
40
tools/ha-test-suite/test/test_keystone-constraint-removal
Normal file
@ -0,0 +1,40 @@
|
||||
# Test: Stop keystone resource (by stopping httpd), check no other resource is stopped
|
||||
|
||||
echo "$(date) * Step 1: disable keystone resource via httpd stop"
|
||||
play_on_resources "disable" "httpd"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: check resource status"
|
||||
# Define resource list without httpd
|
||||
OVERCLOUD_RESOURCES_NO_KEYSTONE="$(echo $OVERCLOUD_RESOURCES | sed 's/httpd/ /g')"
|
||||
# Define number of minutes to look for status
|
||||
MINUTES=10
|
||||
# Cycling for $MINUTES minutes polling every minute the status of the resources
|
||||
echo "$(date) - Cycling for 10 minutes polling every minute the status of the resources"
|
||||
i=0
|
||||
while [ $i -lt $MINUTES ]
|
||||
do
|
||||
# Wait a minute
|
||||
sleep 60
|
||||
echo "$(date) - Polling..."
|
||||
for resource in $OVERCLOUD_RESOURCES_NO_KEYSTONE
|
||||
do
|
||||
echo -n "$resource -> "
|
||||
# If the resource is a multi state like galera or redis, do a different check
|
||||
case $resource in
|
||||
"galera") check_resource_status "$resource" "Masters"
|
||||
;;
|
||||
"redis") check_resource_status "$resource" "(Masters|Slaves)"
|
||||
;;
|
||||
*) check_resource_status "$resource" "Started"
|
||||
;;
|
||||
esac
|
||||
[ $? -eq 0 ] && echo "OK" || (FAILURES=1; echo "Error!"; break)
|
||||
done
|
||||
let "i++"
|
||||
done
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
7
tools/ha-test-suite/test/test_keystone-stop
Normal file
7
tools/ha-test-suite/test/test_keystone-stop
Normal file
@ -0,0 +1,7 @@
|
||||
# Test: Stop openstack-keystone and look for failed actions
|
||||
|
||||
echo "$(date) * Step 1: disable openstack-keystone resource"
|
||||
play_on_resources "disable" "openstack-keystone-clone"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
7
tools/ha-test-suite/test/test_master-slave
Normal file
7
tools/ha-test-suite/test/test_master-slave
Normal file
@ -0,0 +1,7 @@
|
||||
# Test: Stop master slave resources (galera and redis), all the resources should come down
|
||||
|
||||
echo "$(date) * Step 1: disable galera, redis and rabbitmq"
|
||||
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
43
tools/ha-test-suite/test/test_mongo-with-aodh-ceilometer
Normal file
43
tools/ha-test-suite/test/test_mongo-with-aodh-ceilometer
Normal file
@ -0,0 +1,43 @@
|
||||
# Test: Stop mongo resource, check related systemd resources are fine
|
||||
|
||||
echo "$(date) * Step 1: disable mongo"
|
||||
play_on_resources "disable" "mongo"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: check resource status"
|
||||
# Define related resources
|
||||
OVERCLOUD_RESOURCES="openstack-aodh-evaluator openstack-aodh-listener openstack-aodh-notifier openstack-ceilometer-central.service openstack-ceilometer-collector.service openstack-ceilometer-notification.service"
|
||||
# Define number of minutes to look for status
|
||||
MINUTES=10
|
||||
# Cycling for $MINUTES minutes polling every minute the status of the resources
|
||||
echo "$(date) - Cycling for 10 minutes polling every minute the status of the resources"
|
||||
i=0
|
||||
while [ $i -lt $MINUTES ]
|
||||
do
|
||||
# Wait a minute
|
||||
sleep 60
|
||||
echo "$(date) - Polling..."
|
||||
for resource in $OVERCLOUD_RESOURCES
|
||||
do
|
||||
echo -n "$resource -> "
|
||||
# Check if the resource is active for the system
|
||||
systemctl is-active $resource
|
||||
if [ $? -ne 0 ]
|
||||
then
|
||||
# Show status of the resource
|
||||
echo "Error! Resource $resource is not active anymore."
|
||||
systemctl status $resource
|
||||
# Check in any case cluster's failed actions
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
# Now exit with an error
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
let "i++"
|
||||
done
|
||||
|
||||
# If we are here, test was successful
|
||||
echo "$(date) - Test was successful"
|
19
tools/ha-test-suite/test/test_pacemaker-light-a
Normal file
19
tools/ha-test-suite/test/test_pacemaker-light-a
Normal file
@ -0,0 +1,19 @@
|
||||
# Test: Stop every systemd resource, stop Galera and Rabbitmq, Start every systemd resource
|
||||
|
||||
echo "$(date) * Step 1: disable all the systemd resources"
|
||||
play_on_resources "disable" "$OVERCLOUD_SYSTEMD_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: disable core services"
|
||||
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 3: enable each resource one by one and check the status"
|
||||
play_on_resources "enable" "$OVERCLOUD_SYSTEMD_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
19
tools/ha-test-suite/test/test_pacemaker-light-b
Normal file
19
tools/ha-test-suite/test/test_pacemaker-light-b
Normal file
@ -0,0 +1,19 @@
|
||||
# Test: Stop Galera and Rabbitmq, stop every systemd resource, Start every systemd resource
|
||||
|
||||
echo "$(date) * Step 1: disable core services"
|
||||
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: disable all the systemd resources"
|
||||
play_on_resources "disable" "$OVERCLOUD_SYSTEMD_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 3: enable all the systemd resources"
|
||||
play_on_resources "enable" "$OVERCLOUD_SYSTEMD_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
22
tools/ha-test-suite/test/test_pacemaker-light-c
Normal file
22
tools/ha-test-suite/test/test_pacemaker-light-c
Normal file
@ -0,0 +1,22 @@
|
||||
# Test: Stop Galera and Rabbitmq, wait 20 minutes to see if something fails
|
||||
|
||||
echo "$(date) * Step 1: disable core services"
|
||||
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: poll every minute for twenty minutes the status of the resources"
|
||||
for i in $(seq 1 20)
|
||||
do
|
||||
check_failed_actions
|
||||
if [ $? -ne 0 ]
|
||||
then
|
||||
echo "Errors found, test is over."
|
||||
break
|
||||
fi
|
||||
sleep 60
|
||||
done
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
10
tools/ha-test-suite/test/test_processes-after-cluster-stop
Normal file
10
tools/ha-test-suite/test/test_processes-after-cluster-stop
Normal file
@ -0,0 +1,10 @@
|
||||
# Test: Check active processes after cluster stop
|
||||
|
||||
echo "$(date) * Step 1: checking actual process status"
|
||||
check_resources_process_status
|
||||
|
||||
echo "$(date) * Step 2: stopping cluster"
|
||||
sudo pcs cluster stop --all
|
||||
|
||||
echo "$(date) * Step 3: checking actual process status"
|
||||
check_resources_process_status
|
Loading…
x
Reference in New Issue
Block a user