a0206d9626
Currently, if a multi-node cluster is shut down unexpectedly, RabbitMQ is unable to boot and sync with the other nodes. This change adds the option of using the rabbitmqctl force_boot command to recover a RabbitMQ cluster from an unexpected shutdown. Test plan: PASS: Shut down and start a multi-node RabbitMQ cluster. Regression: PASS: OpenStack can be applied successfully. PASS: RabbitMQ nodes can join the RabbitMQ cluster. Story: 2009784 Task: 44290 Ref: [0] https://www.rabbitmq.com/rabbitmqctl.8.html#force_boot Signed-off-by: Maik Catrinque <maik.wandercatrinqueandrade@windriver.com> Co-authored-by: Andrew Martins Carletti <Andrew.MartinsCarletti@windriver.com> Change-Id: I56e966ea64e8881ba436213f0c9e1cbe547098e3
101 lines
3.4 KiB
Smarty
101 lines
3.4 KiB
Smarty
#!/bin/bash
|
|
|
|
{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
|
|
|
|
# Abort on the first failing command (errexit) and trace every command as it
# is executed (xtrace).
set -o errexit -o xtrace
|
|
|
|
# Probe whether a TCP port accepts connections.
# Arguments:
#   $1 - host name or address to connect to
#   $2 - TCP port
# Returns:
#   0 if the connection could be opened, otherwise the non-zero status
#   reported by `timeout` (the attempt is given 10 seconds).
check_if_open() {
  # Locals, so that HOST/PORT variables of the caller (or of the container
  # environment) are not overwritten as a side effect.
  local host=$1
  local port=$2
  # Host and port are handed to the child shell as positional parameters
  # instead of being spliced into its command string, so they are never
  # re-parsed as shell code. Redirecting from /dev/tcp/<host>/<port> makes
  # bash attempt the TCP connection; `true` itself does nothing.
  timeout 10 bash -c 'true &>/dev/null <"/dev/tcp/$1/$2"' _ "${host}" "${port}"
}
|
|
|
|
# Run `rabbitmq-diagnostics node_health_check` against one node, discarding
# all of its output.
# Arguments:
#   $1 - RabbitMQ node name, passed to rabbitmq-diagnostics via -n
# Returns:
#   The exit status of rabbitmq-diagnostics.
check_rabbit_node_health() {
  # Local, so the caller's CLUSTER_SEED_NAME is not overwritten.
  local node_name=$1
  rabbitmq-diagnostics node_health_check -n "${node_name}" -t 10 &>/dev/null
}
|
|
|
|
# Print the RabbitMQ node name of a sibling pod, derived from this pod's own
# node name by swapping the pod ordinal.
# Globals read:
#   MY_POD_NAME        - this pod's name, "<prefix>-<ordinal>"
#   RABBITMQ_NODENAME  - this node's name, "<user>@<MY_POD_NAME>.<domain>"
# Arguments:
#   $1 - ordinal of the pod whose node name is wanted
# Outputs:
#   "<user>@<prefix>-<ordinal>.<domain>" on stdout
get_node_name() {
  local target_pod=$1
  # Pod name with its trailing "-<ordinal>" removed.
  local pod_name_prefix=${MY_POD_NAME%-*}
  # The quoted marker is matched literally, so the "." in it cannot match an
  # arbitrary character the way it would as an awk field-separator regex.
  local own_host_marker="@${MY_POD_NAME}."
  local user=${RABBITMQ_NODENAME%%"${own_host_marker}"*}
  local domain=""
  if [[ "${RABBITMQ_NODENAME}" == *"${own_host_marker}"* ]]; then
    domain=${RABBITMQ_NODENAME#*"${own_host_marker}"}
  fi
  printf '%s\n' "${user}@${pod_name_prefix}-${target_pod}.${domain}"
}
|
|
|
|
# Check that the RabbitMQ node running in a given pod is healthy and that its
# HTTP, AMQP and clustering ports accept connections.
# Globals read:
#   PORT_HTTP, PORT_AMPQ, PORT_CLUSTERING - ports to probe
# Arguments:
#   $1 - ordinal of the pod to check
# Returns:
#   0 only if the health check and all three port probes succeed; the checks
#   run in that order and stop at the first failure.
check_rabbit_node_ready() {
  local target_pod=$1
  local seed_name seed_host
  seed_name="$(get_node_name "${target_pod}")"
  # The host is whatever follows the last "@" of the node name.
  seed_host=${seed_name##*@}
  check_rabbit_node_health "${seed_name}" \
    && check_if_open "${seed_host}" "${PORT_HTTP}" \
    && check_if_open "${seed_host}" "${PORT_AMPQ}" \
    && check_if_open "${seed_host}" "${PORT_CLUSTERING}"
}
|
|
|
|
# Ordinal of this pod: whatever follows the last "-" in its name.
POD_INCREMENT=${MY_POD_NAME##*-}

# A pod other than ordinal 0 that has no mnesia directory yet is started once
# in the background, given time to join the cluster, and then stopped again.
if ! [ "${POD_INCREMENT}" -eq "0" ] && ! [ -d "/var/lib/rabbitmq/mnesia" ]; then
  echo 'This is not the 1st rabbit pod & has not been initialised'
  # disable liveness probe as it may take some time for the pod to come online.
  touch /tmp/rabbit-disable-liveness-probe

  # Wait for every lower-ordinal pod in turn, allowing 900 seconds for each.
  for ((seed_ordinal = 0; seed_ordinal < POD_INCREMENT; seed_ordinal++)); do
    deadline=$(($(date +%s) + 900))
    while ! check_rabbit_node_ready "${seed_ordinal}"; do
      sleep 5
      if [ "$(date +%s)" -gt "${deadline}" ]; then
        echo "RabbitMQ pod ${seed_ordinal} not ready in time"
        exit 1
      fi
    done
  done

  # Stop the background server (ignoring a failure to do so), delete what
  # `find` selects under /var/lib/rabbitmq other than entries named
  # definitions.json or .erlang.cookie, and exit with status 1.
  reset_rabbit() {
    rabbitmqctl shutdown || true
    find /var/lib/rabbitmq/* ! -name 'definitions.json' ! -name '.erlang.cookie' -exec rm -rf {} +
    exit 1
  }

  # Start RabbitMQ, but disable readiness from being reported so the pod is not
  # marked as up prematurely.
  touch /tmp/rabbit-disable-readiness
  rabbitmq-server &

  # Wait for server to start, and reset if it does not
  deadline=$(($(date +%s) + 180))
  while ! rabbitmqctl -q cluster_status; do
    sleep 5
    if [ "$(date +%s)" -gt "${deadline}" ]; then
      reset_rabbit
    fi
  done

  # Wait for server to join cluster, reset if it does not.
  # Both node names are fixed for the lifetime of the pod, so they are
  # resolved once; the name is matched as a literal string (-F), not a regex.
  seed_node="$(get_node_name 0)"
  self_node="$(get_node_name "${POD_INCREMENT}")"
  deadline=$(($(date +%s) + 180))
  while ! rabbitmqctl -l --node "${seed_node}" -q cluster_status \
    | grep -qF -- "${self_node}"; do
    sleep 5
    if [ "$(date +%s)" -gt "${deadline}" ]; then
      reset_rabbit
    fi
  done

  # Shutdown the initial server
  rabbitmqctl shutdown

  rm -fv /tmp/rabbit-disable-readiness /tmp/rabbit-disable-liveness-probe
fi
|
|
|
|
{{- if .Values.forceBoot.enabled }}
# Only on pod 0, and only when a mnesia directory for this node already
# exists: run `rabbitmqctl force_boot` before starting the server, so the node
# can boot after the cluster was shut down unexpectedly.
if [ "${POD_INCREMENT}" -eq "0" ] && [ -d "/var/lib/rabbitmq/mnesia/${RABBITMQ_NODENAME}" ]; then
  rabbitmqctl force_boot
fi
{{- end}}
# Replace this shell with the RabbitMQ server process.
exec rabbitmq-server
|