Fix AIO-DX Uncontrolled Swact ceph-mon failure

This change fixes the scenario where, after an uncontrolled swact
caused by killing one of the critical processes twice, the ceph-mon
service does not start on the new active controller, which in turn
triggers another swact.

A flag file was added to signal that ceph-mon has been shut down
completely. After an uncontrolled swact, the system checks whether
the flag exists and, if so, starts the ceph-mon service on the new
active controller.

Test Plan:
    PASS: System host-swact.
    PASS: Ceph recovery after rebooting the active controller.
    PASS: Ceph recovery after uncontrolled swact killing a critical
          process twice.
    PASS: Ceph recovery after mgmt network outage for a few minutes
          even when rebooting controllers.
    PASS: Ceph recovery after a dead office recovery (DOR).
    PASS: Upgrade success from stx 7.0 to 8.0 in a duplex lab.

Closes-bug: 2017133

Signed-off-by: Pedro Vinícius Silva da Cruz <pedro.silvadacruz@windriver.com>
Change-Id: I6784ec76afa3e62ee14e8ca8f3d6c0212a9f6f3e
This commit is contained in:
Pedro Vinícius Silva da Cruz 2023-04-20 08:36:22 -04:00
parent 52c051e134
commit 09e29800cb

View File

@ -49,6 +49,8 @@ if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; the
CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_0"
CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_1"
CEPH_LAST_ACTIVE_CONTROLLER_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_${HOSTNAME/-/_}"
CEPH_MON_SHUTDOWN_COMPLETE="${CEPH_MON_LIB_PATH}/.ceph_mon_shutdown_complete"
fi
BINDIR=/usr/bin
@ -181,7 +183,13 @@ can_start_ceph_mon ()
else
local CEPH_OTHER_ACTIVE_CONTROLLER_FLAG="${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}"
fi
if [ -f "${CEPH_OTHER_ACTIVE_CONTROLLER_FLAG}" ]; then
if [ -f "${CEPH_MON_SHUTDOWN_COMPLETE}" ]; then
return 0
fi
# Verify drbd-cephmon status
for times in {9..0}; do
is_drbd_cephmon_in_sync
@ -284,6 +292,8 @@ start ()
# Remove old flags
rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}"
rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG}"
rm -f "${CEPH_MON_SHUTDOWN_COMPLETE}"
# Create new flag
touch "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}"
fi
@ -292,9 +302,15 @@ start ()
# stop SERVICE
#
# Runs "${CEPH_SCRIPT} stop SERVICE" through with_service_lock, logging
# before and after via wlog (both helpers are defined elsewhere in this
# script). When SERVICE is "mon" and the system is an All-in-one duplex,
# the ${CEPH_MON_SHUTDOWN_COMPLETE} flag file is created afterwards.
#
# Arguments:
#   $1 - service name handed to the ceph script (may be empty)
# Globals read:
#   CEPH_SCRIPT, CEPH_MON_SHUTDOWN_COMPLETE, system_type, system_mode
# NOTE(review): the flag is created without looking at the exit status
# of the stop command -- confirm that this is intended.
stop ()
{
    local target="$1"

    wlog "-" INFO "Ceph STOP ${target} command received."
    # The trailing expansion is left unquoted on purpose: an empty service
    # name must contribute no argument at all to the ceph script.
    with_service_lock "${target}" ${CEPH_SCRIPT} stop ${target}
    wlog "-" INFO "Ceph STOP ${target} command finished."

    # Leave the shutdown marker only for a monitor stop on AIO duplex.
    if [[ "${target}" == "mon" && "${system_type}" == "All-in-one" && "${system_mode}" == "duplex" ]]; then
        touch "${CEPH_MON_SHUTDOWN_COMPLETE}"
    fi
}
restart ()
@ -304,7 +320,8 @@ restart ()
exit 0
fi
wlog "-" INFO "Ceph RESTART $1 command received."
with_service_lock "$1" ${CEPH_SCRIPT} restart $1
stop "$1"
start "$1"
wlog "-" INFO "Ceph RESTART $1 command finished."
}