Rabbit: Ensure node has joined cluster on initial startup
This PS extends the rabbit startup logic to ensure nodes have actually joined the cluster on startup. Change-Id: Ib876d9abd89209d0a7972983bdf4daacf5f8f582 Signed-off-by: Pete Birley <pete@port.direct>
This commit is contained in:
parent
819cf51083
commit
9b5b901104
@ -18,4 +18,8 @@ limitations under the License.
|
||||
|
||||
# Readiness check. While the marker file /run/rabbit-disable-readiness exists,
# the check fails with exit status 1; otherwise this shell is replaced by
# `rabbitmqctl status`, whose exit status becomes the result of the check.
set -e
|
||||
|
||||
if [ -f /run/rabbit-disable-readiness ]; then
|
||||
exit 1
|
||||
else
|
||||
exec rabbitmqctl status
|
||||
fi
|
||||
|
@ -29,10 +29,15 @@ function check_rabbit_node_health () {
|
||||
rabbitmq-diagnostics node_health_check -n "${CLUSTER_SEED_NAME}" -t 10 &>/dev/null
|
||||
}
|
||||
|
||||
function check_rabbit_node_ready () {
|
||||
get_node_name () {
|
||||
TARGET_POD=$1
|
||||
POD_NAME_PREFIX="$(echo "${MY_POD_NAME}" | awk 'BEGIN{FS=OFS="-"}{NF--; print}')"
|
||||
CLUSTER_SEED_NAME="$(echo "${RABBITMQ_NODENAME}" | awk -F "@${MY_POD_NAME}." "{ print \$1 \"@${POD_NAME_PREFIX}-${TARGET_POD}.\" \$2 }")"
|
||||
echo "${RABBITMQ_NODENAME}" | awk -F "@${MY_POD_NAME}." "{ print \$1 \"@${POD_NAME_PREFIX}-${TARGET_POD}.\" \$2 }"
|
||||
}
|
||||
|
||||
function check_rabbit_node_ready () {
|
||||
TARGET_POD=$1
|
||||
CLUSTER_SEED_NAME="$(get_node_name ${TARGET_POD})"
|
||||
CLUSTER_SEED_HOST="$(echo "${CLUSTER_SEED_NAME}" | awk -F '@' '{ print $NF }')"
|
||||
check_rabbit_node_health "${CLUSTER_SEED_NAME}" && \
|
||||
check_if_open "${CLUSTER_SEED_HOST}" "${PORT_HTTP}" && \
|
||||
@ -56,7 +61,39 @@ if ! [ "${POD_INCREMENT}" -eq "0" ] && ! [ -d "/var/lib/rabbitmq/mnesia" ] ; the
|
||||
fi
|
||||
done
|
||||
done
|
||||
rm -fv /run/rabbit-disable-liveness-probe
|
||||
|
||||
# Abandon the current startup attempt: ask rabbitmqctl to shut the node down
# (a failing shutdown is tolerated via `|| true`), delete everything under
# /var/lib/rabbitmq/, and terminate the script with exit status 1.
function reset_rabbit () {
|
||||
rabbitmqctl shutdown || true
|
||||
rm -rf /var/lib/rabbitmq/*
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Start RabbitMQ, but disable readiness from being reported so the pod is not
|
||||
# marked as up prematurely.
|
||||
touch /run/rabbit-disable-readiness
|
||||
rabbitmq-server &
|
||||
|
||||
# Wait for server to start, and reset if it does not
|
||||
END=$(($(date +%s) + 180))
|
||||
while ! rabbitmqctl -q cluster_status; do
|
||||
sleep 5
|
||||
NOW=$(date +%s)
|
||||
[ $NOW -gt $END ] && reset_rabbit
|
||||
done
|
||||
|
||||
# Wait for server to join cluster, reset if it does not
|
||||
POD_INCREMENT=$(echo "${MY_POD_NAME}" | awk -F '-' '{print $NF}')
|
||||
END=$(($(date +%s) + 180))
|
||||
while ! rabbitmqctl -l --node $(get_node_name 0) -q cluster_status | grep -q "$(get_node_name ${POD_INCREMENT})"; do
|
||||
sleep 5
|
||||
NOW=$(date +%s)
|
||||
[ $NOW -gt $END ] && reset_rabbit
|
||||
done
|
||||
|
||||
# Shut down the initial server
|
||||
rabbitmqctl shutdown
|
||||
|
||||
rm -fv /run/rabbit-disable-readiness /run/rabbit-disable-liveness-probe
|
||||
fi
|
||||
|
||||
exec rabbitmq-server
|
||||
|
@ -16,7 +16,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/}}
|
||||
|
||||
set -e
|
||||
set -ex
|
||||
|
||||
# Extract connection details
|
||||
RABBIT_HOSTNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \
|
||||
@ -24,22 +24,30 @@ RABBIT_HOSTNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \
|
||||
RABBIT_PORT=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \
|
||||
| awk -F'[:/]' '{print $2}'`
|
||||
|
||||
set +x
|
||||
# Extract Admin User credentials
|
||||
RABBITMQ_ADMIN_USERNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \
|
||||
| awk -F'[//:]' '{print $4}'`
|
||||
RABBITMQ_ADMIN_PASSWORD=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \
|
||||
| awk -F'[//:]' '{print $5}'`
|
||||
set -x
|
||||
|
||||
function rabbit_check_node_count () {
|
||||
echo "Checking node count "
|
||||
NODES_IN_CLUSTER=$(rabbitmqadmin \
|
||||
function rabbitmqadmin_authed () {
|
||||
set +x
|
||||
rabbitmqadmin \
|
||||
--host="${RABBIT_HOSTNAME}" \
|
||||
--port="${RABBIT_PORT}" \
|
||||
--username="${RABBITMQ_ADMIN_USERNAME}" \
|
||||
--password="${RABBITMQ_ADMIN_PASSWORD}" \
|
||||
list nodes -f bash | wc -w)
|
||||
$@
|
||||
set -x
|
||||
}
|
||||
|
||||
function rabbit_check_node_count () {
|
||||
echo "Checking node count "
|
||||
NODES_IN_CLUSTER=$(rabbitmqadmin_authed list nodes -f bash | wc -w)
|
||||
if [ "$NODES_IN_CLUSTER" -eq "$RABBIT_REPLICA_COUNT" ]; then
|
||||
echo "Number of nodes in cluster match number of desired pods ($NODES_IN_CLUSTER)"
|
||||
echo "Number of nodes in cluster ($NODES_IN_CLUSTER) match number of desired pods ($NODES_IN_CLUSTER)"
|
||||
else
|
||||
echo "Number of nodes in cluster ($NODES_IN_CLUSTER) does not match number of desired pods ($RABBIT_REPLICA_COUNT)"
|
||||
exit 1
|
||||
@ -49,13 +57,9 @@ function rabbit_check_node_count () {
|
||||
rabbit_check_node_count
|
||||
|
||||
function rabbit_find_partitions () {
|
||||
rabbitmqadmin \
|
||||
--host="${RABBIT_HOSTNAME}" \
|
||||
--port="${RABBIT_PORT}" \
|
||||
--username="${RABBITMQ_ADMIN_USERNAME}" \
|
||||
--password="${RABBITMQ_ADMIN_PASSWORD}" \
|
||||
list nodes -f raw_json | \
|
||||
python -c "
|
||||
NODE_INFO=$(mktemp)
|
||||
rabbitmqadmin_authed list nodes -f pretty_json | tee "${NODE_INFO}"
|
||||
cat "${NODE_INFO}" | python -c "
|
||||
import json, sys, traceback
|
||||
print('Checking cluster partitions')
|
||||
obj=json.load(sys.stdin)
|
||||
@ -66,31 +70,20 @@ for num, node in enumerate(obj):
|
||||
raise Exception('cluster partition found: %s' % partition)
|
||||
except KeyError:
|
||||
print('Error: partition key not found for node %s' % node)
|
||||
sys.exit(1)
|
||||
print('No cluster partitions found')
|
||||
"
|
||||
rm -vf "${NODE_INFO}"
|
||||
}
|
||||
|
||||
rabbit_find_partitions
|
||||
|
||||
function rabbit_check_users_match () {
|
||||
echo "Checking users match on all nodes"
|
||||
NODES=$(rabbitmqadmin \
|
||||
--host="${RABBIT_HOSTNAME}" \
|
||||
--port="${RABBIT_PORT}" \
|
||||
--username="${RABBITMQ_ADMIN_USERNAME}" \
|
||||
--password="${RABBITMQ_ADMIN_PASSWORD}" \
|
||||
list nodes -f bash)
|
||||
NODES=$(rabbitmqadmin_authed list nodes -f bash)
|
||||
USER_LIST=$(mktemp --directory)
|
||||
echo "Found the following nodes: ${NODES}"
|
||||
for NODE in ${NODES}; do
|
||||
echo "Checking Node: ${NODE#*@}"
|
||||
rabbitmqadmin \
|
||||
--host=${NODE#*@} \
|
||||
--port="${RABBIT_PORT}" \
|
||||
--username="${RABBITMQ_ADMIN_USERNAME}" \
|
||||
--password="${RABBITMQ_ADMIN_PASSWORD}" \
|
||||
list users -f bash > ${USER_LIST}/${NODE#*@}
|
||||
rabbitmqadmin_authed list users -f bash > ${USER_LIST}/${NODE#*@}
|
||||
done
|
||||
cd ${USER_LIST}; diff -q --from-file $(ls ${USER_LIST})
|
||||
echo "User lists match for all nodes"
|
||||
|
@ -30,13 +30,21 @@ RABBITMQ_ADMIN_USERNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $
|
||||
RABBITMQ_ADMIN_PASSWORD=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \
|
||||
| awk -F'[//:]' '{print $5}'`
|
||||
|
||||
function active_rabbit_nodes () {
|
||||
set -ex
|
||||
|
||||
function rabbitmqadmin_authed () {
|
||||
set +x
|
||||
rabbitmqadmin \
|
||||
--host="${RABBIT_HOSTNAME}" \
|
||||
--port="${RABBIT_PORT}" \
|
||||
--username="${RABBITMQ_ADMIN_USERNAME}" \
|
||||
--password="${RABBITMQ_ADMIN_PASSWORD}" \
|
||||
list nodes -f bash | wc -w
|
||||
$@
|
||||
set -x
|
||||
}
|
||||
|
||||
# Print the number of whitespace-separated words emitted by
# `rabbitmqadmin_authed list nodes -f bash` (counted with `wc -w`).
# NOTE(review): presumably one word per cluster node, so the output is the
# node count compared against RABBIT_REPLICA_COUNT below - confirm.
function active_rabbit_nodes () {
|
||||
rabbitmqadmin_authed list nodes -f bash | wc -w
|
||||
}
|
||||
|
||||
until test "$(active_rabbit_nodes)" -ge "$RABBIT_REPLICA_COUNT"; do
|
||||
|
Loading…
Reference in New Issue
Block a user