Fix AIO-DX Uncontrolled Swact ceph-mon failure
This change resolves the scenario where, after an uncontrolled swact caused by killing one of the critical processes twice, the ceph-mon service doesn't start on the new active controller, which triggers another swact. A flag was created to signal a complete shutdown of ceph-mon. After an uncontrolled swact, the system checks whether the flag exists and, if so, starts the ceph-mon service on the new active controller. Test Plan: PASS: System host-swact. PASS: Ceph recovery after rebooting the active controller. PASS: Ceph recovery after uncontrolled swact killing a critical process twice. PASS: Ceph recovery after mgmt network outage for a few minutes even when rebooting controllers. PASS: Ceph recovery after case of dead office recovery (DOR). PASS: Upgrade success from stx 7.0 to 8.0 in a duplex lab. Closes-bug: 2017133 Signed-off-by: Pedro Vinícius Silva da Cruz <pedro.silvadacruz@windriver.com> Change-Id: I6784ec76afa3e62ee14e8ca8f3d6c0212a9f6f3e
This commit is contained in:
parent
52c051e134
commit
09e29800cb
@ -49,6 +49,8 @@ if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; the
|
||||
CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_0"
|
||||
CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_controller_1"
|
||||
CEPH_LAST_ACTIVE_CONTROLLER_FLAG="${CEPH_MON_LIB_PATH}/.last_ceph_mon_active_${HOSTNAME/-/_}"
|
||||
|
||||
CEPH_MON_SHUTDOWN_COMPLETE="${CEPH_MON_LIB_PATH}/.ceph_mon_shutdown_complete"
|
||||
fi
|
||||
|
||||
BINDIR=/usr/bin
|
||||
@ -181,7 +183,13 @@ can_start_ceph_mon ()
|
||||
else
|
||||
local CEPH_OTHER_ACTIVE_CONTROLLER_FLAG="${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}"
|
||||
fi
|
||||
|
||||
if [ -f "${CEPH_OTHER_ACTIVE_CONTROLLER_FLAG}" ]; then
|
||||
|
||||
if [ -f "${CEPH_MON_SHUTDOWN_COMPLETE}" ]; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Verify drbd-cephmon status
|
||||
for times in {9..0}; do
|
||||
is_drbd_cephmon_in_sync
|
||||
@ -284,6 +292,8 @@ start ()
|
||||
# Remove old flags
|
||||
rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_0_FLAG}"
|
||||
rm -f "${CEPH_LAST_ACTIVE_CONTROLLER_1_FLAG}"
|
||||
rm -f "${CEPH_MON_SHUTDOWN_COMPLETE}"
|
||||
|
||||
# Create new flag
|
||||
touch "${CEPH_LAST_ACTIVE_CONTROLLER_FLAG}"
|
||||
fi
|
||||
@ -292,9 +302,15 @@ start ()
|
||||
|
||||
#######################################
# Stop a Ceph service through the init script while holding the service lock.
#
# On an All-in-one duplex system, a successful "mon" stop also creates the
# ${CEPH_MON_SHUTDOWN_COMPLETE} flag file. can_start_ceph_mon checks for that
# flag when deciding whether ceph-mon may be started.
#
# Globals:   CEPH_SCRIPT, system_type, system_mode,
#            CEPH_MON_SHUTDOWN_COMPLETE (all read)
# Arguments: $1 - service to stop (e.g. "mon"); may be empty
# Returns:   status of the last command run; the status of the stop command
#            itself is not propagated to the caller.
#######################################
stop ()
{
    local service="$1"
    local rc=0

    wlog "-" INFO "Ceph STOP $1 command received."
    # ${1:+"$1"} expands to nothing when no service was given, so the init
    # script never receives an empty trailing argument, while a non-empty
    # service name is passed as exactly one word.
    with_service_lock "$1" ${CEPH_SCRIPT} stop ${1:+"$1"} || rc=$?
    wlog "-" INFO "Ceph STOP $1 command finished."

    if [ "${service}" == "mon" ] && [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then
        if [ "${rc}" -eq 0 ]; then
            touch "${CEPH_MON_SHUTDOWN_COMPLETE}"
        else
            # The flag asserts a complete ceph-mon shutdown; do not create it
            # when the stop command reported a failure.
            wlog "-" INFO "Ceph STOP $1 failed (rc=${rc}); not marking ceph-mon shutdown as complete."
        fi
    fi
}
|
||||
|
||||
restart ()
|
||||
@ -304,7 +320,8 @@ restart ()
|
||||
exit 0
|
||||
fi
|
||||
wlog "-" INFO "Ceph RESTART $1 command received."
|
||||
with_service_lock "$1" ${CEPH_SCRIPT} restart $1
|
||||
stop "$1"
|
||||
start "$1"
|
||||
wlog "-" INFO "Ceph RESTART $1 command finished."
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user