Integrate ha-test-suite into the repo
This commit puts the ha-test-suite under the tools directory instead of having it downloaded from the outside (originally it was available at the link https://github.com/rscarazz/tripleo-director-ha-test-suite). At code level the suite is now copied directly from the local path tools/ha-test-suite and the executable is now ha-test-suite.sh. Change-Id: I087bc28a0afa3ede9b2fb698892b8306f56790a2
This commit is contained in:
parent
98593f6628
commit
d9e9613a8b
@ -17,14 +17,13 @@
|
||||
"{{ working_dir }}/workarounds.sh"
|
||||
when: apply_workarounds
|
||||
|
||||
- name: Get overcloud-ha-test-suite on undercloud and controllers
|
||||
- name: Copy ha-test-suite on undercloud and controllers
|
||||
shell: >
|
||||
rm -rf tripleo-director-ha-test-suite;
|
||||
git clone https://github.com/rscarazz/tripleo-director-ha-test-suite/ tripleo-director-ha-test-suite;
|
||||
delegate_to: "{{ item }}"
|
||||
/usr/bin/rsync --delay-updates -F --compress --archive -e 'ssh -F {{ local_working_dir }}/ssh.config.ansible' {{ local_working_dir }}/tripleo-quickstart-utils/tools/ha-test-suite {{ item }}:
|
||||
delegate_to: "localhost"
|
||||
with_items:
|
||||
- "undercloud"
|
||||
- "{{ groups['controller'] }}"
|
||||
- "undercloud"
|
||||
|
||||
- name: Include test sequence depending on release
|
||||
include_vars:
|
||||
@ -39,7 +38,7 @@
|
||||
- name: HA test - Failed actions (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_check-failed-actions
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_check-failed-actions
|
||||
register: test_ha_failed_actions_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -54,7 +53,7 @@
|
||||
- name: HA test - Master/Slave core resource stop and start (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_master-slave -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_master-slave
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_master-slave -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_master-slave
|
||||
register: test_ha_master_slave_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -69,7 +68,7 @@
|
||||
- name: HA test Keystone stop (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_keystone-constraint-removal
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_keystone-constraint-removal
|
||||
register: test_ha_keystone_stop_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -84,7 +83,7 @@
|
||||
- name: HA test Keystone removal (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_keystone-constraint-removal
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_keystone-constraint-removal -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_keystone-constraint-removal
|
||||
register: test_ha_keystone_constraint_removal_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -99,7 +98,7 @@
|
||||
- name: HA test NG A (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_pacemaker-light-a -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_pacemaker-light
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_pacemaker-light-a -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_pacemaker-light
|
||||
register: test_ha_ng_a_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -114,7 +113,7 @@
|
||||
- name: HA test NG B (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_pacemaker-light-b -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_pacemaker-light
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_pacemaker-light-b -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_pacemaker-light
|
||||
register: test_ha_ng_b_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -129,7 +128,7 @@
|
||||
- name: HA test NG C (overcloud)
|
||||
delegate_to: overcloud-controller-0
|
||||
shell: >
|
||||
{{ overcloud_working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/test/test_pacemaker-light-c -r {{ overcloud_working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_pacemaker-light
|
||||
{{ overcloud_working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ overcloud_working_dir }}/ha-test-suite/test/test_pacemaker-light-c -r {{ overcloud_working_dir }}/ha-test-suite/recovery/recovery_pacemaker-light
|
||||
register: test_ha_ng_c_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
@ -143,7 +142,7 @@
|
||||
- block:
|
||||
- name: HA Test instance deploy on the overcloud (undercloud)
|
||||
shell: >
|
||||
{{ working_dir }}/tripleo-director-ha-test-suite/TD-ha-test-suite.sh -t {{ working_dir }}/tripleo-director-ha-test-suite/test/test_instance-creation -r {{ working_dir }}/tripleo-director-ha-test-suite/recovery/recovery_instance-creation -u
|
||||
{{ working_dir }}/ha-test-suite/ha-test-suite.sh -t {{ working_dir }}/ha-test-suite/test/test_instance-creation -r {{ working_dir }}/ha-test-suite/recovery/recovery_instance-creation -u
|
||||
register: test_ha_instance_cmd
|
||||
always:
|
||||
- name: copy stdout test result to undercloud and check command
|
||||
|
145
tools/ha-test-suite/README.md
Normal file
145
tools/ha-test-suite/README.md
Normal file
@ -0,0 +1,145 @@
|
||||
# OpenStack TripleO HA Test Suite
|
||||
|
||||
This project is a modular and a customizable test suite to be applied in an
|
||||
Overcloud OpenStack environment deployed via TripleO upstream or Red Hat
|
||||
OpenStack Director (OSPd).
|
||||
|
||||
## Usage
|
||||
|
||||
The script needs at least a test file (-t) which must contain the sequence of
|
||||
the operations to be done. A recovery file (-r), with the sequence of the
|
||||
operations needed to recovery the environment can also be passed. So a typical
|
||||
invocation will be something like this:
|
||||
|
||||
```console
|
||||
[heat-admin@overcloud-controller-0 ha-test-suite]$ ./ha-test-suite.sh -t test/test_keystone-constraint-removal -r recovery/recovery_keystone-constraint-removal
|
||||
Fri May 20 15:27:19 UTC 2016 - Populating overcloud elements...OK
|
||||
Fri May 20 15:27:22 UTC 2016 - Test: Stop keystone resource (by stopping httpd), check no other resource is stopped
|
||||
Fri May 20 15:27:22 UTC 2016 * Step 1: disable keystone resource via httpd stop
|
||||
Fri May 20 15:27:22 UTC 2016 - Performing action disable on resource httpd ..OK
|
||||
Fri May 20 15:27:26 UTC 2016 - List of cluster's failed actions:
|
||||
Cluster is OK.
|
||||
Fri May 20 15:27:29 UTC 2016 * Step 2: check resource status
|
||||
Fri May 20 15:27:29 UTC 2016 - Cycling for 10 minutes polling every minute the status of the resources
|
||||
Fri May 20 15:28:29 UTC 2016 - Polling...
|
||||
delay -> OK
|
||||
galera -> OK
|
||||
...
|
||||
...
|
||||
openstack-sahara-engine -> OK
|
||||
rabbitmq -> OK
|
||||
redis -> OK
|
||||
Fri May 20 15:41:00 UTC 2016 - List of cluster's failed actions:
|
||||
Cluster is OK.
|
||||
Fri May 20 15:41:03 UTC 2016 - Waiting 10 seconds to recover environment
|
||||
Fri May 20 15:41:13 UTC 2016 - Recovery: Enable keystone via httpd and check for failed actions
|
||||
Fri May 20 15:41:13 UTC 2016 * Step 1: enable keystone resource via httpd
|
||||
Fri May 20 15:41:13 UTC 2016 - Performing action enable on resource httpd-clone OK
|
||||
Fri May 20 15:41:15 UTC 2016 - List of cluster's failed actions:
|
||||
Cluster is OK.
|
||||
Fri May 20 15:41:17 UTC 2016 - End
|
||||
```
|
||||
|
||||
The exit status will depend on the result of the operations. If a disable
|
||||
operation fails, if failed actions appear, or if recovery does not end with
|
||||
success, the exit status will not be 0.
|
||||
|
||||
## Test and recoveries
|
||||
|
||||
Test and recovery are bash script portions that are
|
||||
included inside the main script. Some functions and variables are available to
|
||||
help on recurring operations. These functions are listed here:
|
||||
|
||||
- **check_failed_actions**: will print failed actions and return error in case
|
||||
some of them are present;
|
||||
- **check_resources_process_status**: will check for the process status of the
|
||||
resources on the system (not in the cluster), i.e. will check if there is a
|
||||
process for mysql daemon;
|
||||
- **wait_resource_status**: will wait until a default timeout
|
||||
($RESOURCE_CHANGE_STATUS_TIMEOUT) for a resource to reach a status;
|
||||
- **check_resource_status**: will check a resource status, i.e. if you want to
|
||||
check if httpd resource is started;
|
||||
- **wait_cluster_start**: will wait until a timeout
|
||||
($RESOURCE_CHANGE_STATUS_TIMEOUT) for the cluster to be started; specifically it will wait for
|
||||
all resources to be in state "Started";
|
||||
- **play_on_resources**: will set the status of a resource;
|
||||
|
||||
The variables are:
|
||||
|
||||
- **OVERCLOUD_CORE_RESOURCES**: which are galera and rabbitmq
|
||||
- **OVERCLOUD_RESOURCES**: which are *all* the resources
|
||||
- **OVERCLOUD_SYSTEMD_RESOURCES**: which are the resources managed via systemd
|
||||
by pacemaker;
|
||||
|
||||
These can be used in combination to write test and recovery files.
|
||||
|
||||
### Test file contents
|
||||
|
||||
A typical test file, say test/test_keystone-constraint-removal, will contain
|
||||
something like this:
|
||||
|
||||
```bash
|
||||
# Test: Stop keystone resource (by stopping httpd), check no other resource is stopped
|
||||
|
||||
echo "$(date) * Step 1: disable keystone resource via httpd stop"
|
||||
play_on_resources "disable" "httpd"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: check resource status"
|
||||
# Define resource list without httpd
|
||||
OVERCLOUD_RESOURCES_NO_KEYSTONE="$(echo $OVERCLOUD_RESOURCES | sed 's/httpd/ /g')"
|
||||
# Define number of minutes to look for status
|
||||
MINUTES=10
|
||||
# Cycling for $MINUTES minutes polling every minute the status of the resources
|
||||
echo "$(date) - Cycling for 10 minutes polling every minute the status of the resources"
|
||||
i=0
|
||||
while [ $i -lt $MINUTES ]
|
||||
do
|
||||
# Wait a minute
|
||||
sleep 60
|
||||
echo "$(date) - Polling..."
|
||||
for resource in $OVERCLOUD_RESOURCES_NO_KEYSTONE
|
||||
do
|
||||
echo -n "$resource -> "
|
||||
check_resource_status "$resource" "Started"
|
||||
[ $? -eq 0 ] && echo "OK" || (FAILURES=1; echo "Error!")
|
||||
done
|
||||
let "i++"
|
||||
done
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
```
|
||||
|
||||
Code is commented and should be self explaining, but in short:
|
||||
- the first commented line, after "# Test: " is read as test title;
|
||||
- using play_on_resources it disables httpd resource;
|
||||
- it checks for failed actions;
|
||||
- it defines a list of variable named OVERCLOUD_RESOURCES_NO_KEYSTONE containing
|
||||
all the variable but httpd;
|
||||
- it cycles for 10 minutes, polling every minute the status of all the
|
||||
resources;
|
||||
|
||||
If any of these steps for some reason fails, then the overall test will be
|
||||
considered failed and the exit status will not be 0.
|
||||
|
||||
### Recovery file contents
|
||||
|
||||
A typical recovery file, say recovery/recovery_keystone-constraint-removal,
|
||||
will contain something like this:
|
||||
|
||||
```bash
|
||||
# Recovery: Enable keystone via httpd and check for failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable keystone resource via httpd"
|
||||
play_on_resources "enable" "httpd-clone"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:" check_failed_actions
|
||||
```
|
||||
|
||||
Again:
|
||||
- the first commented line, after "# Recovery: " is read as recovery title;
|
||||
- using play_on_resources it enables httpd resource;
|
||||
- it checks for failed actions;
|
87
tools/ha-test-suite/ha-test-suite.sh
Executable file
87
tools/ha-test-suite/ha-test-suite.sh
Executable file
@ -0,0 +1,87 @@
|
||||
#!/bin/bash

# Raoul Scarazzini (rasca@redhat.com)
# This script provides a testing suite for TripleO/Director OpenStack HA (so with Pacemaker) environments.
# It sources a test file (-t) and, optionally, a recovery file (-r); exit
# status is nonzero when a sourced step (or a helper function) aborts.

# Define main workdir (directory containing this script)
WORKDIR=$(dirname $0)

# Source function library (usage, check_*, wait_*, play_on_resources).
. $WORKDIR/include/functions

# Fixed parameters
# How much time to wait, in seconds, for a resource to change status (i.e. from started to stopped)
RESOURCE_CHANGE_STATUS_TIMEOUT=600
# How much time to wait, in seconds, before starting recovery
DEFAULT_RECOVERY_WAIT_TIME=10

# Command line parameters: -t/--test and -r/--recover take a value (hence the
# extra shift inside their case arm), -u/--undercloud is a flag.
if [ $# -gt 0 ]
then
  while :; do
    case $1 in
      -h|-\?|--help)
        usage
        exit
        ;;
      -u|--undercloud)
        undercloud=true
        shift
        ;;
      -t|--test)
        test_sequence="$2"
        shift
        ;;
      -r|--recover)
        recovery_sequence="$2"
        shift
        ;;
      --)
        shift
        break
        ;;
      -?*)
        usage
        exit 1
        ;;
      *)
        break
    esac

    shift
  done
else
  usage
  exit 1
fi

# Populating overcloud elements if not on undercloud.
# These variables are consumed by the sourced test/recovery files.
if [ "$undercloud" != true ]
then
  # FIX: message typo ("Populationg" -> "Populating")
  echo -n "$(date) - Populating overcloud elements..."
  OVERCLOUD_CORE_RESOURCES="galera redis rabbitmq"
  OVERCLOUD_RESOURCES=$(sudo pcs resource show | egrep '^ (C|[a-Z])' | sed 's/.* \[\(.*\)\]/\1/g' | sed 's/ \(.*\)(.*):.*/\1/g' | sort)
  OVERCLOUD_SYSTEMD_RESOURCES=$(sudo pcs config show | egrep "Resource:.*systemd"|grep -v "haproxy"|awk '{print $2}')
  echo "OK"
fi

# Run the test: its title is the first "# Test:" comment line in the file.
if [ -f "$test_sequence" ]
then
  echo "$(date) - Test: $(grep '^#.*Test:' $test_sequence | sed 's/^#.*Test: //')"
  . $test_sequence
else
  echo "No test file passed or unable to read test file."
fi

# Run the recovery (after a grace period), title taken from "# Recovery:".
if [ -f "$recovery_sequence" ]
then
  echo "$(date) - Waiting $DEFAULT_RECOVERY_WAIT_TIME seconds to recover environment"
  sleep $DEFAULT_RECOVERY_WAIT_TIME

  echo "$(date) - Recovery: $(grep '^#.*Recovery:' $recovery_sequence | sed 's/^#.*Recovery: //')"
  . $recovery_sequence
else
  echo "No recovery file passed or unable to read recovery file."
fi

echo "$(date) - End"
|
151
tools/ha-test-suite/include/functions
Executable file
151
tools/ha-test-suite/include/functions
Executable file
@ -0,0 +1,151 @@
|
||||
# Raoul Scarazzini (rasca@redhat.com)
|
||||
# This script provides a testing suite for TripleO/Director OpenStack HA
# (Pacemaker-managed) environments: functions to be used inside
# TripleO/Director OpenStack HA environments.
|
||||
|
||||
# Print the command-line help for the test suite.
function usage {
  cat <<EOF
Usage $0 -t <testfile> [-r <recover file>] [-u]
-t, --test <testfile> Specify which file contains the test to run
-r, --recover <recoverfile> Specify which file (if any) should be used for recovery
-u, --undercloud Test will be performed on undercloud

EOF
}
|
||||
|
||||
# Report the cluster's failed actions, either globally (no argument) or for a
# single resource ($1). Exits 1 when failed actions are found; returns 0
# otherwise.
function check_failed_actions {
  target=$1

  # No "Failed Actions:" section in pcs status at all: nothing is wrong.
  if ! sudo pcs status | grep "Failed Actions:" &> /dev/null
  then
    if [ "x$target" == "x" ]
    then
      echo "Cluster is OK."
    else
      echo "No failed actions for $target."
    fi
    return 0
  fi

  # Global check: list the resources that appear in the failed-actions section.
  if [ "x$target" == "x" ]
  then
    echo "Cluster has failed actions:"
    sudo pcs status | sed -n -e '/Failed Actions:/,/^$/p' | egrep 'OCF_|not running|unknown' | awk '{print $2}' | cut -f1 -d_ | sort |uniq
    exit 1
  fi

  # Per-resource check: grep the failed-actions section for the resource.
  # $? below is the grep's status, propagated through the substitution.
  errors=$(sudo pcs status | sed -n -e '/Failed Actions:/,/^$/p' | grep -A1 $target)
  if [ $? -eq 0 ]
  then
    echo "Resource $target has failed actions:"
    echo $errors
    exit 1
  fi

  echo "No failed actions for $target."
  return 0
}
|
||||
|
||||
# For every resource in $OVERCLOUD_RESOURCES, probe its backing process on the
# local system (not the cluster view) and print "<resource> -> active|inactive".
function check_resources_process_status {
  for res in $OVERCLOUD_RESOURCES
  do
    echo -n "$res -> "

    case $res in
      ip-*) # VIP resources encode the address in their name: ip-<address>
        #ip_addr=$(pcs resource show $res | grep Attributes | sed 's/.*ip=\(.*\) cidr.*/\1/g')
        ip_addr=$(echo $res | sed 's/ip-//g')
        sudo ip a s | grep $ip_addr &> /dev/null
        ;;
      rabbitmq)
        sudo /usr/sbin/rabbitmqctl cluster_status &> /dev/null
        ;;
      redis)
        pidof /usr/bin/redis-server &> /dev/null
        ;;
      galera)
        pidof /usr/libexec/mysqld &> /dev/null
        ;;
      *cleanup*|delay)
        # Nothing to probe; the echo's 0 status yields "active" below.
        echo -n "no need to check if it's "
        ;;
      *)
        systemctl is-active $res &> /dev/null
        ;;
    esac
    # $? is the status of the probe executed in the case arm above.
    if [ $? -eq 0 ]
    then
      echo "active"
    else
      echo "inactive"
    fi
  done
}
|
||||
|
||||
# Poll (up to $RESOURCE_CHANGE_STATUS_TIMEOUT seconds, once per second) until
# every instance of a clone/master-slave resource ($1) reaches status $2.
# On timeout: print failed actions and abort the whole run (exit 1).
function wait_resource_status {
  res=$1
  wanted=$2
  attempt=1

  while [ $attempt -lt $RESOURCE_CHANGE_STATUS_TIMEOUT ]
  do
    # Lines of the resource's status block NOT yet in the wanted state.
    pending=$(sudo pcs status resources | sed -n -e "/\(Clone\|Master\/Slave\) Set: .*\[$res\]/,/^ [a-Z]/p" | head -n -1 | tail -n +2 | egrep -v "$wanted\:")
    [ "x$pending" == "x" ] && return 0

    echo -n "."
    sleep 1
    let "attempt++"
  done

  # Timed out: report and stop.
  check_failed_actions
  exit 1
}
|
||||
|
||||
# One-shot check that every instance of a clone/master-slave resource ($1) is
# in status $2. Returns 0 on success; on mismatch prints failed actions and
# yields a nonzero status.
function check_resource_status {
  res=$1
  wanted=$2

  pending=$(sudo pcs status resources | sed -n -e "/\(Clone\|Master\/Slave\) Set: .*\[$res\]/,/^ [a-Z]/p" | head -n -1 | tail -n +2 | egrep -v "$wanted\:")

  # Since we are checking a specific status, any remaining output means the
  # resource is not in the state we expect. NOTE(review): the subshell is kept
  # on purpose — 'exit 1' only leaves the subshell, so callers receive status 1
  # rather than the whole script aborting (matches original behavior).
  [ "x$pending" == "x" ] && return 0 || (check_failed_actions; exit 1)
}
|
||||
|
||||
# Wait (up to $RESOURCE_CHANGE_STATUS_TIMEOUT seconds) for all cluster
# resources to leave the "Stopped" state. Returns 0 when everything is
# started; on failed resources or timeout, prints diagnostics and exits 1.
function wait_cluster_start {
  tick=1

  while [ $tick -ne $RESOURCE_CHANGE_STATUS_TIMEOUT ]
  do
    # Any failed resource ends the wait immediately.
    if sudo pcs status | egrep "Failed" &> /dev/null
    then
      break
    fi

    # Still-stopped resources: keep waiting; none left: success.
    if sudo pcs status | egrep "Stopped" &> /dev/null
    then
      echo -n "."
    else
      echo "All cluster resources are started."
      return 0
    fi

    sleep 1
    let "tick++"
  done

  # If we are here then we have problems: we hit the timeout or we still have
  # stopped/failed resources.
  echo "Problems found. There are stopped or failed resources!"
  check_failed_actions
  exit 1
}
|
||||
|
||||
# Apply a pcs action ($1, e.g. enable/disable) to each resource in the
# space-separated list $2, waiting up to $RESOURCE_CHANGE_STATUS_TIMEOUT for
# each. On any failure: print that resource's failed actions and exit 1.
function play_on_resources {
  action=$1
  resources=$2

  for resource in $resources
  do
    echo -n "$(date) - Performing action $action on resource $resource "
    # Do the action on the resource.
    if sudo pcs resource $action $resource --wait $RESOURCE_CHANGE_STATUS_TIMEOUT
    then
      echo "OK"
    else
      echo "FAILURE!"
      check_failed_actions $resource
      exit 1
    fi
  done

  return 0
}
|
13
tools/ha-test-suite/recovery/recovery_entire-cluster
Normal file
13
tools/ha-test-suite/recovery/recovery_entire-cluster
Normal file
@ -0,0 +1,13 @@
|
||||
# Recovery: Enable all systemd and core resources, cleanup failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable all the cluster resources"
|
||||
play_on_resources "enable" "$OVERCLOUD_RESOURCES"
|
||||
|
||||
echo "$(date) * Step 2: Cleaning up failed resources"
|
||||
sudo pcs status | sed -n -e '/Failed Actions:/,/^$/p' | egrep 'OCF_TIMEOUT|not running' | awk '{print $2}' | cut -f1 -d_ | sort | uniq | while read RES; do echo "Cleaning $RES"; sudo pcs resource cleanup $RES; done
|
||||
|
||||
echo "$(date) * Step 3: Waiting all resources to start"
|
||||
wait_cluster_start
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
27
tools/ha-test-suite/recovery/recovery_instance-creation
Normal file
27
tools/ha-test-suite/recovery/recovery_instance-creation
Normal file
@ -0,0 +1,27 @@
|
||||
# Recovery: Instance creation
|
||||
|
||||
STACKDIR=/home/stack
|
||||
|
||||
INSTANCE_NAME=cirros-1
|
||||
|
||||
source $STACKDIR/overcloudrc
|
||||
|
||||
floatingip=$(nova list | grep $INSTANCE_NAME | awk '{print $13}')
|
||||
floatingip_id=$(neutron floatingip-list | grep $floatingip | awk '{print $2}')
|
||||
port_id=$(neutron port-list | grep $floatingip | awk '{print $2}')
|
||||
|
||||
neutron floatingip-disassociate $floatingip_id $port_id
|
||||
neutron floatingip-delete $floatingip_id
|
||||
nova delete $INSTANCE_NAME
|
||||
projectid=$(openstack project list | awk '/admin/ {print $2}')
|
||||
glance --os-project-id=$projectid image-delete $(glance --os-project-id=$projectid image-list | grep CirrOS | awk '{print $2}')
|
||||
rm /tmp/cirros-0.3.4-x86_64-disk.img
|
||||
nova flavor-delete overcloud-instance-test-small-flavor
|
||||
neutron router-gateway-clear floating-router floating-network
|
||||
neutron router-interface-delete floating-router private-subnet
|
||||
neutron router-delete floating-router
|
||||
neutron security-group-delete pingandssh
|
||||
neutron subnet-delete private-subnet
|
||||
neutron subnet-delete floating-subnet
|
||||
neutron net-delete floating-network
|
||||
neutron net-delete private-network
|
@ -0,0 +1,7 @@
|
||||
# Recovery: Enable keystone via httpd and check for failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable keystone resource via httpd"
|
||||
play_on_resources "enable" "httpd-clone"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
10
tools/ha-test-suite/recovery/recovery_keystone-stop
Normal file
10
tools/ha-test-suite/recovery/recovery_keystone-stop
Normal file
@ -0,0 +1,10 @@
|
||||
# Recovery: Enable openstack-keystone and check for failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable openstack-keystone resource"
|
||||
play_on_resources "enable" "openstack-keystone-clone"
|
||||
|
||||
echo "$(date) - Checking for Stopped resources:"
|
||||
wait_cluster_start
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
7
tools/ha-test-suite/recovery/recovery_master-slave
Normal file
7
tools/ha-test-suite/recovery/recovery_master-slave
Normal file
@ -0,0 +1,7 @@
|
||||
# Recovery: Enable master slave resources (galera and redis), all the resources should come up
|
||||
|
||||
echo "$(date) * Step 1: enable galera, redis and rabbitmq"
|
||||
play_on_resources "enable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
7
tools/ha-test-suite/recovery/recovery_mongo
Normal file
7
tools/ha-test-suite/recovery/recovery_mongo
Normal file
@ -0,0 +1,7 @@
|
||||
# Recovery: Enable mongo and check for failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable mongo"
|
||||
play_on_resources "enable" "mongo"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
13
tools/ha-test-suite/recovery/recovery_pacemaker-light
Normal file
13
tools/ha-test-suite/recovery/recovery_pacemaker-light
Normal file
@ -0,0 +1,13 @@
|
||||
# Recovery: Enable all systemd and core resources, cleanup failed actions
|
||||
|
||||
echo "$(date) * Step 1: enable core resources"
|
||||
play_on_resources "enable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) * Step 2: enable all the systemd resources"
|
||||
play_on_resources "enable" "$OVERCLOUD_SYSTEMD_RESOURCES"
|
||||
|
||||
echo "$(date) * Step 3: Waiting all resources to start"
|
||||
wait_cluster_start
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
@ -0,0 +1,10 @@
|
||||
# Recovery: Start cluster again
|
||||
|
||||
echo "$(date) * Step 1: start the cluster"
|
||||
sudo pcs cluster start --all
|
||||
|
||||
echo "$(date) * Step 2: Waiting all resources to start"
|
||||
wait_cluster_start
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
3
tools/ha-test-suite/test/test_check-failed-actions
Normal file
3
tools/ha-test-suite/test/test_check-failed-actions
Normal file
@ -0,0 +1,3 @@
|
||||
# Test: Wait cluster start and look for failed actions
|
||||
echo "$(date) - Waiting for cluster start and checking for failed resources:"
|
||||
wait_cluster_start
|
112
tools/ha-test-suite/test/test_instance-creation
Normal file
112
tools/ha-test-suite/test/test_instance-creation
Normal file
@ -0,0 +1,112 @@
|
||||
# Test: Instance deployment
#
# Deploys a CirrOS instance on the overcloud (network, subnet, router,
# security group, flavor, image, boot), associates a floating IP and pings it.
# Sourced by ha-test-suite.sh with -u (undercloud), so 'exit 1' aborts the run.

set -e

STACKDIR=/home/stack

source $STACKDIR/overcloudrc

# Load the environment with all the info for floating network
source $STACKDIR/environment

TIMEOUT=30
INSTANCE_NAME=cirros-1
PRIVATE_NETWORK=10.1.1.0/24

# Gateway creation
# When the environment is recovering from a previous test it can happen
# that neutron is waiting to rejoin its cluster, preventing from creating
# new stuff like router. We wait at least 300 seconds before giving up.
set +e
RUN=1
while [ $RUN -lt $TIMEOUT ]
do
  neutron router-create floating-router
  if [ $? -eq 0 ]
  then
    break
  else
    echo "Waiting..."
    let "RUN++"
    sleep 10
  fi
done
# If timeout was reached then we need to exit with error
if [ $RUN -ge $TIMEOUT ]
then
  echo "It was not possible to create the router, giving up."
  exit 1
fi
set -e

# Network and subnet creation
neutron net-create floating-network --router:external=True --provider:physical_network $FLOATING_PHYSICAL_NET --provider:network_type flat
neutron subnet-create --name floating-subnet --disable-dhcp --allocation-pool start=$FLOATING_RANGE_START,end=$FLOATING_RANGE_END --gateway $FLOATING_GW floating-network $FLOATING_SUBNET
neutron net-create private-network
neutron subnet-create private-network $PRIVATE_NETWORK --name private-subnet
# Router configuration
neutron router-interface-add floating-router private-subnet
neutron router-gateway-set floating-router floating-network
neutron security-group-create pingandssh
securitygroup_id=$(neutron security-group-list | grep pingandssh | head -1 | awk '{print $2}')
neutron security-group-rule-create --direction ingress --protocol tcp --port-range-min 22 --port-range-max 22 $securitygroup_id
neutron security-group-rule-create --protocol icmp --direction ingress $securitygroup_id
floatingip=$(neutron floatingip-create floating-network | grep floating_ip_address | awk '{print $4}')
echo floatingip=$floatingip

#[stack@mrg-06 ~]$ neutron net-list
#...
#| 6fde7d2a-e2d9-4b0f-a982-b7cbc3244807 | private-network | 31a5ccd5-07bd-4103-a4a3-ab2c6d6148d7 10.1.1.0/24 |
#...
nova flavor-create --ephemeral 0 --is-public True test.small overcloud-instance-test-small-flavor 2048 20 1
private_net_id=$(neutron net-list | grep private-network | awk '{print $2}')
wget -O /tmp/cirros-0.3.4-x86_64-disk.img http://download.cirros-cloud.net/0.3.4/cirros-0.3.4-x86_64-disk.img
projectid=$(openstack project list | awk '/admin/ {print $2}')
glance --os-project-id=$projectid image-create --name CirrOS --container-format bare --disk-format raw --file /tmp/cirros-0.3.4-x86_64-disk.img
nova boot --image CirrOS --flavor test.small --security-groups pingandssh --nic net-id=$private_net_id $INSTANCE_NAME

#[stack@mrg-06 ~]$ nova list
#...
#| eb29c1a1-c30e-4f8f-91ea-cec1fd38c088 | $INSTANCE_NAME | BUILD | spawning | NOSTATE | private-network=10.1.1.5 |
#...
sleep 5
instance_ip=$(nova list | grep $INSTANCE_NAME | awk '{print $12}' | sed "s/private-network=//g")
echo instance_ip=$instance_ip

#[stack@mrg-06 ~]$ neutron port-list
#...
#| 61ce215d-3dc7-4873-af73-342620cdc3b6 | | fa:16:3e:8d:8b:8d | {"subnet_id": "31a5ccd5-07bd-4103-a4a3-ab2c6d6148d7", "ip_address": "10.1.1.5"} |
#...
port_id=$(neutron port-list | grep $instance_ip | awk '{print $2}')
echo port_id=$port_id

#[stack@mrg-06 ~]$ neutron floatingip-list
#...
#| 624f5256-ee89-438f-8335-904017e74a18 | | 10.16.144.77 | |
#...
floatingip_id=$(neutron floatingip-list | grep $floatingip | awk '{print $2}')
echo floatingip_id=$floatingip_id
neutron floatingip-associate $floatingip_id $port_id

echo "------------------------------------------------------------"
echo "$(date) Instance will be available at the IP $floatingip"
echo "------------------------------------------------------------"

set +e

COUNTER=1
while [ $COUNTER -lt $TIMEOUT ]
do
  # FIX: redirect order — '2>&1 > /dev/null' sent stderr to the terminal;
  # '> /dev/null 2>&1' silences both streams as intended.
  ping -c1 $floatingip > /dev/null 2>&1

  if [ $? -eq 0 ]
  then
    echo "SUCCESS"
    break
  else
    echo -n "."
  fi
  let COUNTER=COUNTER+1
done

# FIX: brace group instead of subshell — the original '(echo "FAILURE!"; exit 1)'
# only exited the subshell, so a ping timeout never failed the test run.
[ $COUNTER -ge $TIMEOUT ] && { echo "FAILURE!"; exit 1; }
|
40
tools/ha-test-suite/test/test_keystone-constraint-removal
Normal file
40
tools/ha-test-suite/test/test_keystone-constraint-removal
Normal file
@ -0,0 +1,40 @@
|
||||
# Test: Stop keystone resource (by stopping httpd), check no other resource is stopped
|
||||
|
||||
echo "$(date) * Step 1: disable keystone resource via httpd stop"
|
||||
play_on_resources "disable" "httpd"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: check resource status"
|
||||
# Define resource list without httpd
|
||||
OVERCLOUD_RESOURCES_NO_KEYSTONE="$(echo $OVERCLOUD_RESOURCES | sed 's/httpd/ /g')"
|
||||
# Define number of minutes to look for status
|
||||
MINUTES=10
|
||||
# Cycling for $MINUTES minutes polling every minute the status of the resources
|
||||
echo "$(date) - Cycling for 10 minutes polling every minute the status of the resources"
|
||||
i=0
|
||||
while [ $i -lt $MINUTES ]
|
||||
do
|
||||
# Wait a minute
|
||||
sleep 60
|
||||
echo "$(date) - Polling..."
|
||||
for resource in $OVERCLOUD_RESOURCES_NO_KEYSTONE
|
||||
do
|
||||
echo -n "$resource -> "
|
||||
# If the resource is a multi state like galera or redis, do a different check
|
||||
case $resource in
|
||||
"galera") check_resource_status "$resource" "Masters"
|
||||
;;
|
||||
"redis") check_resource_status "$resource" "(Masters|Slaves)"
|
||||
;;
|
||||
*) check_resource_status "$resource" "Started"
|
||||
;;
|
||||
esac
|
||||
[ $? -eq 0 ] && echo "OK" || (FAILURES=1; echo "Error!"; break)
|
||||
done
|
||||
let "i++"
|
||||
done
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
7
tools/ha-test-suite/test/test_keystone-stop
Normal file
7
tools/ha-test-suite/test/test_keystone-stop
Normal file
@ -0,0 +1,7 @@
|
||||
# Test: Stop openstack-keystone and look for failed actions
|
||||
|
||||
echo "$(date) * Step 1: disable openstack-keystone resource"
|
||||
play_on_resources "disable" "openstack-keystone-clone"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
7
tools/ha-test-suite/test/test_master-slave
Normal file
7
tools/ha-test-suite/test/test_master-slave
Normal file
@ -0,0 +1,7 @@
|
||||
# Test: Stop master slave resources (galera and redis), all the resources should come down
|
||||
|
||||
echo "$(date) * Step 1: disable galera, redis and rabbitmq"
|
||||
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
43
tools/ha-test-suite/test/test_mongo-with-aodh-ceilometer
Normal file
43
tools/ha-test-suite/test/test_mongo-with-aodh-ceilometer
Normal file
@ -0,0 +1,43 @@
|
||||
# Test: Stop mongo resource, check related systemd resources are fine
|
||||
|
||||
echo "$(date) * Step 1: disable mongo"
|
||||
play_on_resources "disable" "mongo"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: check resource status"
|
||||
# Define related resources
|
||||
OVERCLOUD_RESOURCES="openstack-aodh-evaluator openstack-aodh-listener openstack-aodh-notifier openstack-ceilometer-central.service openstack-ceilometer-collector.service openstack-ceilometer-notification.service"
|
||||
# Define number of minutes to look for status
|
||||
MINUTES=10
|
||||
# Cycling for $MINUTES minutes polling every minute the status of the resources
|
||||
echo "$(date) - Cycling for 10 minutes polling every minute the status of the resources"
|
||||
i=0
|
||||
while [ $i -lt $MINUTES ]
|
||||
do
|
||||
# Wait a minute
|
||||
sleep 60
|
||||
echo "$(date) - Polling..."
|
||||
for resource in $OVERCLOUD_RESOURCES
|
||||
do
|
||||
echo -n "$resource -> "
|
||||
# Check if the resource is active for the system
|
||||
systemctl is-active $resource
|
||||
if [ $? -ne 0 ]
|
||||
then
|
||||
# Show status of the resource
|
||||
echo "Error! Resource $resource is not active anymore."
|
||||
systemctl status $resource
|
||||
# Check in any case cluster's failed actions
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
# Now exit with an error
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
let "i++"
|
||||
done
|
||||
|
||||
# If we are here, test was successful
|
||||
echo "$(date) - Test was successful"
|
19
tools/ha-test-suite/test/test_pacemaker-light-a
Normal file
19
tools/ha-test-suite/test/test_pacemaker-light-a
Normal file
@ -0,0 +1,19 @@
|
||||
# Test: Stop every systemd resource, stop Galera and Rabbitmq, Start every systemd resource
|
||||
|
||||
echo "$(date) * Step 1: disable all the systemd resources"
|
||||
play_on_resources "disable" "$OVERCLOUD_SYSTEMD_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: disable core services"
|
||||
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 3: enable each resource one by one and check the status"
|
||||
play_on_resources "enable" "$OVERCLOUD_SYSTEMD_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
19
tools/ha-test-suite/test/test_pacemaker-light-b
Normal file
19
tools/ha-test-suite/test/test_pacemaker-light-b
Normal file
@ -0,0 +1,19 @@
|
||||
# Test: Stop Galera and Rabbitmq, stop every systemd resource, Start every systemd resource
|
||||
|
||||
echo "$(date) * Step 1: disable core services"
|
||||
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: disable all the systemd resources"
|
||||
play_on_resources "disable" "$OVERCLOUD_SYSTEMD_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 3: enable all the systemd resources"
|
||||
play_on_resources "enable" "$OVERCLOUD_SYSTEMD_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
22
tools/ha-test-suite/test/test_pacemaker-light-c
Normal file
22
tools/ha-test-suite/test/test_pacemaker-light-c
Normal file
@ -0,0 +1,22 @@
|
||||
# Test: Stop Galera and Rabbitmq, wait 20 minutes to see if something fails
|
||||
|
||||
echo "$(date) * Step 1: disable core services"
|
||||
play_on_resources "disable" "$OVERCLOUD_CORE_RESOURCES"
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
||||
|
||||
echo "$(date) * Step 2: poll every minute for twenty minutes the status of the resources"
|
||||
for i in $(seq 1 20)
|
||||
do
|
||||
check_failed_actions
|
||||
if [ $? -ne 0 ]
|
||||
then
|
||||
echo "Errors found, test is over."
|
||||
break
|
||||
fi
|
||||
sleep 60
|
||||
done
|
||||
|
||||
echo "$(date) - List of cluster's failed actions:"
|
||||
check_failed_actions
|
10
tools/ha-test-suite/test/test_processes-after-cluster-stop
Normal file
10
tools/ha-test-suite/test/test_processes-after-cluster-stop
Normal file
@ -0,0 +1,10 @@
|
||||
# Test: Check active processes after cluster stop
|
||||
|
||||
echo "$(date) * Step 1: checking actual process status"
|
||||
check_resources_process_status
|
||||
|
||||
echo "$(date) * Step 2: stopping cluster"
|
||||
sudo pcs cluster stop --all
|
||||
|
||||
echo "$(date) * Step 3: checking actual process status"
|
||||
check_resources_process_status
|
Loading…
x
Reference in New Issue
Block a user