Improve fault tolerance of MariaDB startup

* Changed podManagementPolicy to Parallel in order to allow recovery from the second or third master. After rebooting the whole cluster, whether on purpose or after a power failure, a primary node that the cluster can synchronize with is required. This is usually done automatically by selecting the node with the highest transaction id. The current implementation of the stateful set starts the nodes in sequence, preventing the start of further nodes if the process terminates with an error state. Because of this, the cluster may not come up if the first or second node is not in primary state. * Elects the first node started in primary state as bootstrap source. * Displays warnings and runs mysqld with wsrep-recover on crashed nodes. * Introduces the FORCE_RECOVERY argument for crash recovery. In case the primary selection failed, the cluster bootstrap process must be manually initiated from the most advanced node (highest committed transaction id). This information is available from the grastate.dat file in case of a clean shutdown. On crashed nodes an InnoDB recovery is required to get the last committed transaction id. start.sh handles both cases and gives instructions on how to recover the cluster after a hard failure. If FORCE_RECOVERY is set to the name of a pod (mariadb-0, mariadb-1, ...), the bootstrap process will be initiated from the specified node. DocImpact Closes-Bug: #1716461 Change-Id: I96a8cb52124f64920a7d9cf21a8924ede78ebf7b
2018-01-21 22:06:37 +01:00 · 2018-01-21 22:06:37 +01:00 · d78f8e0901
commit d78f8e0901
parent 3917369bda
2 changed files with 125 additions and 27 deletions
--- a/mariadb/templates/bin/_start.sh.tpl
+++ b/mariadb/templates/bin/_start.sh.tpl
@ -19,7 +19,128 @@ set -xe
 # Bootstrap database
 CLUSTER_INIT_ARGS=""
-if [ ! -d /var/lib/mysql/mysql ]; then
+CLUSTER_CONFIG_PATH=/etc/mysql/conf.d/10-cluster-config.cnf
 # Print manual-recovery instructions to stderr and abort the start script.
 #
 # Used when no running cluster member could be reached and this node is not
 # allowed to bootstrap the cluster on its own. The banner shows this node's
 # Galera state so the operator can compare it with the other pods.
 #
 # Globals:   POD_NAME (read)
 # Files:     /var/lib/mysql/grastate.dat (read)
 # Outputs:   recovery banner on stderr
 # Returns:   never returns; exits the script with status 1
 function exitWithManualRecovery() {
    local UUID SEQNO
    # Keep only the value of the "uuid:" / "seqno:" line; every line that
    # does not match is deleted from the output.
    UUID=$(sed -e 's/^.*uuid:[\ ,\t]*//' -e 'tx' -e 'd' -e ':x' /var/lib/mysql/grastate.dat)
    SEQNO=$(sed -e 's/^.*seqno:[\ ,\t]*//' -e 'tx' -e 'd' -e ':x' /var/lib/mysql/grastate.dat)
    # Write to the already-open stderr descriptor instead of reopening
    # /dev/stderr by path, which can fail depending on who owns it.
    cat >&2 <<EOF
   **********************************************************
   *            MANUAL RECOVERY ACTION REQUIRED             *
   **********************************************************
 All cluster members are down and grastate.dat indicates that it's not
 safe to start the cluster from this node. If you see this message on
 all nodes, you have to do a manual recovery by following these steps:
    a) Find the node with the highest WSREP seq#:
 	POD ${POD_NAME} uuid: ${UUID} seq: ${SEQNO}
   	If you see uuid 00000000-0000-0000-0000-000000000000 with
   	seq -1, the node crashed during DDL.
   	If seq is -1 you will find a DETECTED CRASH message
   	on your log. Check the output from InnoDB for the last
   	transaction id available.
    b) Set environment variable FORCE_RECOVERY=<NAME OF POD>
       to force bootstrapping from the specified node.
        Remember to remove FORCE_RECOVERY after your nodes
        are fully recovered! You may lose data otherwise.
 You can ignore this message and wait for the next restart if at
 least one node started without errors.
 EOF
    exit 1
 }
 # Construct cluster config: collect every replica except this pod as a
 # comma-separated "host:port" list in MEMBERS.
 MEMBERS=""
 for i in $(seq 1 ${MARIADB_REPLICAS}); do
     # seq counts from 1, the pod name suffix counts from 0
     NUM="$((i - 1))"
     CANDIDATE_POD="${SERVICE_NAME}-${NUM}.$(hostname -d)"
     # A node never lists itself as a peer
     if [[ "${CANDIDATE_POD}" == "${POD_NAME}.$(hostname -d)" ]]; then
         continue
     fi
     # The separator is only emitted once the list already has an entry
     MEMBERS+="${MEMBERS:+,}${CANDIDATE_POD}:${WSREP_PORT}"
 done
 # Write the Galera settings for this pod. The ">" redirect replaces any
 # previous content of the file, so a restarted container does not pile up
 # duplicate [mysqld] sections. MEMBERS holds the peer list assembled for
 # wsrep_cluster_address.
 # ${CLUSTER_CONFIG_PATH} is expanded unquoted, so the path must not contain
 # whitespace.
 echo "Writing cluster config for ${POD_NAME} to ${CLUSTER_CONFIG_PATH}"
 cat > ${CLUSTER_CONFIG_PATH} <<EOF
 [mysqld]
 wsrep_cluster_address="gcomm://${MEMBERS}"
 wsrep_node_address=${POD_IP}
 wsrep_node_name=${POD_NAME}.$(hostname -d)
 EOF
 # Print a warning banner on stderr whenever FORCE_RECOVERY is set.
 # "${FORCE_RECOVERY// }" removes every space before the test, so a value
 # consisting only of spaces is treated like an unset variable.
 # This block only warns; the script carries on afterwards.
 if [ ! -z "${FORCE_RECOVERY// }" ]; then
    	cat >/dev/stderr <<EOF
   **********************************************************
   *    !!!        FORCE_RECOVERY WARNING       !!!         *
   **********************************************************
 POD is starting with FORCE_RECOVERY defined. Remember to unset this
 variable after recovery! You may end up in recovering from a node
 with old data on a crash!
 You have been warned ;-)
   **********************************************************
   *               FORCE_RECOVERY WARNING                   *
   **********************************************************
 EOF
 fi
 if [ -d /var/lib/mysql/mysql -a -f /var/lib/mysql/grastate.dat ]; then
    # Node already initialized
    if [ "$(sed -e 's/^.*seqno:[\ ,\t]*//' -e 'tx' -e 'd' -e ':x' /var/lib/mysql/grastate.dat)" = "-1" ]; then
    	cat >/dev/stderr <<EOF
   **********************************************************
   *                   DETECTED CRASH                       *
   **********************************************************
 Trying to recover from a previous crash by running with wsrep-recover...
 EOF
 	mysqld --wsrep_cluster_address=gcomm:// --wsrep-recover
    fi
    echo "Check if we can find a cluster memeber."
    if ! mysql --defaults-file=/etc/mysql/admin_user.cnf \
        --connect-timeout 2 \
         -e 'select 1'; then
 	# No other nodes are running
    	if [ -z "${FORCE_RECOVERY// }" -a "$(sed -e 's/^.*safe_to_bootstrap:[\ ,\t]*//' -e 'tx' -e 'd' -e ':x' /var/lib/mysql/grastate.dat)" = "1" ]; then
            echo 'Bootstrapping from this node.'
            CLUSTER_INIT_ARGS=--wsrep-new-cluster
        elif [ "x${FORCE_RECOVERY}x" = "x${POD_NAME}x" ]; then
            echo 'Forced recovery bootstrap from this node.'
            CLUSTER_INIT_ARGS=--wsrep-new-cluster
            cp -f /var/lib/mysql/grastate.dat /var/lib/mysql/grastate.bak
    	    cat >/var/lib/mysql/grastate.dat <<EOF
 `grep -v 'safe_to_bootstrap:' /var/lib/mysql/grastate.bak`
 safe_to_bootstrap: 1
 EOF
 	    chown -R mysql:mysql /var/lib/mysql/grastate.dat
        else
    	    exitWithManualRecovery
    	fi
    fi
 elif [ ! -d /var/lib/mysql/mysql -o "x${FORCE_BOOTSTRAP}" = "xtrue" ]; then
    if [ "x${POD_NAME}" = "x${SERVICE_NAME}-0" ]; then
        echo No data found for pod 0
        if [ "x${FORCE_BOOTSTRAP}" = "xtrue" ]; then
@ -43,32 +164,6 @@ if [ ! -d /var/lib/mysql/mysql ]; then
    chown -R mysql:mysql /var/lib/mysql
 fi
 # Construct cluster config: remember where the config file lives and collect
 # every replica except this pod as a comma-separated "host:port" list.
 CLUSTER_CONFIG_PATH=/etc/mysql/conf.d/10-cluster-config.cnf
 MEMBERS=""
 for i in $(seq 1 ${MARIADB_REPLICAS}); do
     # seq counts from 1, the pod name suffix counts from 0
     NUM="$((i - 1))"
     CANDIDATE_POD="${SERVICE_NAME}-${NUM}.$(hostname -d)"
     # A node never lists itself as a peer
     if [[ "${CANDIDATE_POD}" == "${POD_NAME}.$(hostname -d)" ]]; then
         continue
     fi
     # The separator is only emitted once the list already has an entry
     MEMBERS+="${MEMBERS:+,}${CANDIDATE_POD}:${WSREP_PORT}"
 done
 # Add a [mysqld] section with the Galera settings for this pod to the
 # cluster config file. MEMBERS holds the peer list used for
 # wsrep_cluster_address.
 # NOTE(review): ">>" appends, so each run adds another copy of the section
 # unless the file is recreated between starts; confirm that is intended.
 echo "Writing cluster config for ${POD_NAME} to ${CLUSTER_CONFIG_PATH}"
 cat >> ${CLUSTER_CONFIG_PATH} << EOF
 [mysqld]
 wsrep_cluster_address="gcomm://${MEMBERS}"
 wsrep_node_address=${POD_IP}
 wsrep_node_name=${POD_NAME}.$(hostname -d)
 EOF
 if [ "x${CLUSTER_BOOTSTRAP}" = "xtrue" ]; then
  mysql_install_db --user=mysql --datadir=/var/lib/mysql
--- a/mariadb/templates/statefulset.yaml
+++ b/mariadb/templates/statefulset.yaml
@ -27,6 +27,7 @@ metadata:
  name: mariadb
 spec:
  serviceName: "{{ tuple "oslo_db" "discovery" . | include "helm-toolkit.endpoints.hostname_short_endpoint_lookup" }}"
  podManagementPolicy: "Parallel"
  replicas: {{ .Values.pod.replicas.server }}
  template:
    metadata:
@ -74,6 +75,8 @@ spec:
                  fieldPath: metadata.name
            - name: FORCE_BOOTSTRAP
              value: {{ .Values.force_bootstrap | quote }}
            # Name of the pod (e.g. mariadb-0) the cluster bootstrap is forced
            # from; leave empty in normal operation. The misspelled key
            # `force_recovey` is still read as a fallback for existing overrides.
            - name: FORCE_RECOVERY
              value: {{ .Values.force_recovery | default .Values.force_recovey | quote }}
            - name: BOOTSTRAP_FILE
              value: {{ printf "/tmp/%s.sql" (randAlphaNum 8) }}
            - name: MARIADB_REPLICAS