Rabbit: Ensure node has joined cluster on initial startup

This PS extends the rabbit startup logic to ensure nodes have
actually joined the cluster on startup.

Change-Id: Ib876d9abd89209d0a7972983bdf4daacf5f8f582
Signed-off-by: Pete Birley <pete@port.direct>
This commit is contained in:
Pete Birley 2019-05-16 16:10:32 -05:00 committed by Pete Birley
parent 819cf51083
commit 9b5b901104
4 changed files with 75 additions and 33 deletions

View File

@ -18,4 +18,8 @@ limitations under the License.
set -e set -e
exec rabbitmqctl status if [ -f /run/rabbit-disable-readiness ]; then
exit 1
else
exec rabbitmqctl status
fi

View File

@ -29,10 +29,15 @@ function check_rabbit_node_health () {
rabbitmq-diagnostics node_health_check -n "${CLUSTER_SEED_NAME}" -t 10 &>/dev/null rabbitmq-diagnostics node_health_check -n "${CLUSTER_SEED_NAME}" -t 10 &>/dev/null
} }
function check_rabbit_node_ready () { get_node_name () {
TARGET_POD=$1 TARGET_POD=$1
POD_NAME_PREFIX="$(echo "${MY_POD_NAME}" | awk 'BEGIN{FS=OFS="-"}{NF--; print}')" POD_NAME_PREFIX="$(echo "${MY_POD_NAME}" | awk 'BEGIN{FS=OFS="-"}{NF--; print}')"
CLUSTER_SEED_NAME="$(echo "${RABBITMQ_NODENAME}" | awk -F "@${MY_POD_NAME}." "{ print \$1 \"@${POD_NAME_PREFIX}-${TARGET_POD}.\" \$2 }")" echo "${RABBITMQ_NODENAME}" | awk -F "@${MY_POD_NAME}." "{ print \$1 \"@${POD_NAME_PREFIX}-${TARGET_POD}.\" \$2 }"
}
function check_rabbit_node_ready () {
TARGET_POD=$1
CLUSTER_SEED_NAME="$(get_node_name ${TARGET_POD})"
CLUSTER_SEED_HOST="$(echo "${CLUSTER_SEED_NAME}" | awk -F '@' '{ print $NF }')" CLUSTER_SEED_HOST="$(echo "${CLUSTER_SEED_NAME}" | awk -F '@' '{ print $NF }')"
check_rabbit_node_health "${CLUSTER_SEED_NAME}" && \ check_rabbit_node_health "${CLUSTER_SEED_NAME}" && \
check_if_open "${CLUSTER_SEED_HOST}" "${PORT_HTTP}" && \ check_if_open "${CLUSTER_SEED_HOST}" "${PORT_HTTP}" && \
@ -56,7 +61,39 @@ if ! [ "${POD_INCREMENT}" -eq "0" ] && ! [ -d "/var/lib/rabbitmq/mnesia" ] ; the
fi fi
done done
done done
rm -fv /run/rabbit-disable-liveness-probe
# Give up on the current start attempt: ask the local broker to shut down,
# delete everything matched by /var/lib/rabbitmq/*, and exit with status 1.
reset_rabbit () {
  # The shutdown is best-effort; a failure here must not stop the wipe below.
  if ! rabbitmqctl shutdown; then
    true
  fi
  rm -rf /var/lib/rabbitmq/*
  exit 1
}
# Start RabbitMQ in the background, but disable readiness from being reported
# so the pod is not marked as up prematurely.
touch /run/rabbit-disable-readiness
rabbitmq-server &
# Wait up to 180 seconds for the local server to answer cluster_status; once
# the deadline has passed, reset_rabbit is called.
END=$(($(date +%s) + 180))
while ! rabbitmqctl -q cluster_status; do
  sleep 5
  NOW=$(date +%s)
  [ "${NOW}" -gt "${END}" ] && reset_rabbit
done
# Wait up to 180 seconds for this pod's node name to appear in the cluster
# status reported by the pod-0 node; once the deadline has passed,
# reset_rabbit is called.
POD_INCREMENT=$(echo "${MY_POD_NAME}" | awk -F '-' '{print $NF}')
END=$(($(date +%s) + 180))
# grep -F matches the node name as a fixed string: the name contains dots,
# which would otherwise act as regex wildcards. The "--" guards against a
# pattern that starts with a dash.
while ! rabbitmqctl -l --node "$(get_node_name 0)" -q cluster_status \
    | grep -qF -- "$(get_node_name "${POD_INCREMENT}")"; do
  sleep 5
  NOW=$(date +%s)
  [ "${NOW}" -gt "${END}" ] && reset_rabbit
done
# Shut down the initial server
rabbitmqctl shutdown
rm -fv /run/rabbit-disable-readiness /run/rabbit-disable-liveness-probe
fi fi
exec rabbitmq-server exec rabbitmq-server

View File

@ -16,7 +16,7 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/}} */}}
set -e set -ex
# Extract connection details # Extract connection details
RABBIT_HOSTNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \ RABBIT_HOSTNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \
@ -24,22 +24,30 @@ RABBIT_HOSTNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \
RABBIT_PORT=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \ RABBIT_PORT=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \
| awk -F'[:/]' '{print $2}'` | awk -F'[:/]' '{print $2}'`
set +x
# Extract Admin User credentials # Extract Admin User credentials
RABBITMQ_ADMIN_USERNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \ RABBITMQ_ADMIN_USERNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \
| awk -F'[//:]' '{print $4}'` | awk -F'[//:]' '{print $4}'`
RABBITMQ_ADMIN_PASSWORD=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \ RABBITMQ_ADMIN_PASSWORD=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \
| awk -F'[//:]' '{print $5}'` | awk -F'[//:]' '{print $5}'`
set -x
function rabbit_check_node_count () { function rabbitmqadmin_authed () {
echo "Checking node count " set +x
NODES_IN_CLUSTER=$(rabbitmqadmin \ rabbitmqadmin \
--host="${RABBIT_HOSTNAME}" \ --host="${RABBIT_HOSTNAME}" \
--port="${RABBIT_PORT}" \ --port="${RABBIT_PORT}" \
--username="${RABBITMQ_ADMIN_USERNAME}" \ --username="${RABBITMQ_ADMIN_USERNAME}" \
--password="${RABBITMQ_ADMIN_PASSWORD}" \ --password="${RABBITMQ_ADMIN_PASSWORD}" \
list nodes -f bash | wc -w) $@
set -x
}
function rabbit_check_node_count () {
echo "Checking node count "
NODES_IN_CLUSTER=$(rabbitmqadmin_authed list nodes -f bash | wc -w)
if [ "$NODES_IN_CLUSTER" -eq "$RABBIT_REPLICA_COUNT" ]; then if [ "$NODES_IN_CLUSTER" -eq "$RABBIT_REPLICA_COUNT" ]; then
echo "Number of nodes in cluster match number of desired pods ($NODES_IN_CLUSTER)" echo "Number of nodes in cluster ($NODES_IN_CLUSTER) match number of desired pods ($NODES_IN_CLUSTER)"
else else
echo "Number of nodes in cluster ($NODES_IN_CLUSTER) does not match number of desired pods ($RABBIT_REPLICA_COUNT)" echo "Number of nodes in cluster ($NODES_IN_CLUSTER) does not match number of desired pods ($RABBIT_REPLICA_COUNT)"
exit 1 exit 1
@ -49,13 +57,9 @@ function rabbit_check_node_count () {
rabbit_check_node_count rabbit_check_node_count
function rabbit_find_partitions () { function rabbit_find_partitions () {
rabbitmqadmin \ NODE_INFO=$(mktemp)
--host="${RABBIT_HOSTNAME}" \ rabbitmqadmin_authed list nodes -f pretty_json | tee "${NODE_INFO}"
--port="${RABBIT_PORT}" \ cat "${NODE_INFO}" | python -c "
--username="${RABBITMQ_ADMIN_USERNAME}" \
--password="${RABBITMQ_ADMIN_PASSWORD}" \
list nodes -f raw_json | \
python -c "
import json, sys, traceback import json, sys, traceback
print('Checking cluster partitions') print('Checking cluster partitions')
obj=json.load(sys.stdin) obj=json.load(sys.stdin)
@ -66,31 +70,20 @@ for num, node in enumerate(obj):
raise Exception('cluster partition found: %s' % partition) raise Exception('cluster partition found: %s' % partition)
except KeyError: except KeyError:
print('Error: partition key not found for node %s' % node) print('Error: partition key not found for node %s' % node)
sys.exit(1)
print('No cluster partitions found') print('No cluster partitions found')
" "
rm -vf "${NODE_INFO}"
} }
rabbit_find_partitions rabbit_find_partitions
function rabbit_check_users_match () { function rabbit_check_users_match () {
echo "Checking users match on all nodes" echo "Checking users match on all nodes"
NODES=$(rabbitmqadmin \ NODES=$(rabbitmqadmin_authed list nodes -f bash)
--host="${RABBIT_HOSTNAME}" \
--port="${RABBIT_PORT}" \
--username="${RABBITMQ_ADMIN_USERNAME}" \
--password="${RABBITMQ_ADMIN_PASSWORD}" \
list nodes -f bash)
USER_LIST=$(mktemp --directory) USER_LIST=$(mktemp --directory)
echo "Found the following nodes: ${NODES}" echo "Found the following nodes: ${NODES}"
for NODE in ${NODES}; do for NODE in ${NODES}; do
echo "Checking Node: ${NODE#*@}" echo "Checking Node: ${NODE#*@}"
rabbitmqadmin \ rabbitmqadmin_authed list users -f bash > ${USER_LIST}/${NODE#*@}
--host=${NODE#*@} \
--port="${RABBIT_PORT}" \
--username="${RABBITMQ_ADMIN_USERNAME}" \
--password="${RABBITMQ_ADMIN_PASSWORD}" \
list users -f bash > ${USER_LIST}/${NODE#*@}
done done
cd ${USER_LIST}; diff -q --from-file $(ls ${USER_LIST}) cd ${USER_LIST}; diff -q --from-file $(ls ${USER_LIST})
echo "User lists match for all nodes" echo "User lists match for all nodes"

View File

@ -30,13 +30,21 @@ RABBITMQ_ADMIN_USERNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $
RABBITMQ_ADMIN_PASSWORD=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \ RABBITMQ_ADMIN_PASSWORD=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \
| awk -F'[//:]' '{print $5}'` | awk -F'[//:]' '{print $5}'`
function active_rabbit_nodes () { set -ex
function rabbitmqadmin_authed () {
set +x
rabbitmqadmin \ rabbitmqadmin \
--host="${RABBIT_HOSTNAME}" \ --host="${RABBIT_HOSTNAME}" \
--port="${RABBIT_PORT}" \ --port="${RABBIT_PORT}" \
--username="${RABBITMQ_ADMIN_USERNAME}" \ --username="${RABBITMQ_ADMIN_USERNAME}" \
--password="${RABBITMQ_ADMIN_PASSWORD}" \ --password="${RABBITMQ_ADMIN_PASSWORD}" \
list nodes -f bash | wc -w $@
set -x
}
function active_rabbit_nodes () {
rabbitmqadmin_authed list nodes -f bash | wc -w
} }
until test "$(active_rabbit_nodes)" -ge "$RABBIT_REPLICA_COUNT"; do until test "$(active_rabbit_nodes)" -ge "$RABBIT_REPLICA_COUNT"; do