Adds support for 3 monitors on AIO-DX

Adds two fixed Ceph monitors on the controllers when deploying an AIO-DX, to improve HA. Includes: - Creates a new shell script to manage the fixed monitors. - Creates a new patch to include the mon_data parameter in the mon.pp puppet manifest. - Creates the ceph-storage-network script that is used by the storage-networking SM service to stop Ceph services in case of a network outage. - Alters the script used by the ceph-mds pmon file to ceph-init-wrapper. - Adjusts the ceph-init-wrapper to accept commands from the pmon service. - Adjusts the ceph-init-wrapper to accept the forcestop command. - When stopping Ceph services using ceph-init-wrapper, it is checked whether the pid exists before trying. - When stopping the ceph-mon service using ceph-init-wrapper, the ceph-mds is stopped right before to force a re-peering. - When starting the ceph-mon service using ceph-init-wrapper, the ceph-mds is stopped right before to force a re-peering. - When starting ceph-mds, it is checked whether the ceph-mon is operational. - The forcestop command uses a TERM signal first before attempting a KILL signal after 5 seconds. Test Plan: PASS: Fresh install AIO-DX and check 3 Ceph monitors are running. PASS: Fresh install all other setups and check if Ceph is working as expected. PASS: Reboot the standby controller and check if Ceph is still running. PASS: Reboot the active controller. Ceph will stop responding, but it will recover after both controllers are running. PASS: Verify Ceph is working after a DOR test with pods writing to the cephfs and rbd pools. PASS: Verify Ceph is resilient to switch reboots. Story: 2011122 Task: 50129 Change-Id: I18d7ab9da3303265da34bc13c8be4baa23c2a7be Signed-off-by: Hediberto C Silva <hediberto.cavalcantedasilva@windriver.com> Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com>
2024-04-02 16:15:54 -03:00 · 2024-04-02 16:15:54 -03:00 · d5a84a1dbc
commit d5a84a1dbc
parent 8e1e55284e
9 changed files with 447 additions and 60 deletions
--- a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init
+++ b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init
@ -623,7 +623,7 @@ stop_daemon() {
 	        if [ \$timeout -lt 0 ]; then
 	            break
 	        fi
-	        timeout-=1
+	        timeout=\$((timeout-1))
 	    fi
 	    cmd=\"kill $signal \$pid\"
 	    printf \"\$cmd...\"
@ -769,7 +769,7 @@ fi
 # When this is a AIO-DX pmon is monitoring ceph-mds process.
 # If ceph-mon is not running, ceph-mds will hang when starting.
 # Check if we are trying to bring up ceph-mds and ceph-mon is not ready yet
-if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then
+if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; then
    if [ "${command}" = "start" -o "${command}" = "onestart" ]; then
        what_out=
        what_mds=
@ -873,6 +873,12 @@ for name in $what; do
    # conf file
    cmd="$cmd -c $conf"

+    # StarlingX:
+    # If this is AIO-DX, check if service is the fixed Ceph monitor and set the parameter --mon-data
+    if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ] && [ "$type" == "mon" ] && [ "$id" == ${HOSTNAME} ]; then
+        cmd="$cmd --mon-data /var/lib/ceph/data/ceph-${HOSTNAME}"
+    fi
+
    if echo $name | grep -q ^osd; then
 	get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
 	get_conf fs_path "$osd_data" "fs path"  # mount point defaults so osd data
@ -928,13 +934,15 @@ for name in $what; do

 	    [ -n "$TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES" ] && tcmalloc="TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=$TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES"

-		# StarlingX: start processes in scope under slice system-ceph.slice
-		# so that ceph processes do not start under this script's callers cgroup
-		if [ "$type" = "osd" ]; then
-			cmd="systemd-run --scope --unit=ceph-${type}-${id} --slice=system-ceph $cmd"
-		else
-			cmd="systemd-run --scope --unit=ceph-${type} --slice=system-ceph $cmd"
-		fi
+	    # StarlingX: start processes in scope under slice system-ceph.slice
+	    # so that ceph processes do not start under this script's callers cgroup
+	    if [ "$type" = "osd" ]; then
+	        cmd="systemd-run --scope --unit=ceph-${type}-${id} --slice=system-ceph $cmd"
+	    elif [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ] && [ "$type" == "mon" ] && [ "$id" == ${HOSTNAME} ]; then
+	        cmd="systemd-run --scope --unit=ceph-${type}-${id} --slice=system-ceph $cmd"
+	    else
+	        cmd="systemd-run --scope --unit=ceph-${type} --slice=system-ceph $cmd"
+	    fi

 	    # StarlingX: not running as ceph user/group
 	    cmd="$files $tcmalloc $wrap $cmd --cluster $cluster $runmode"
@ -988,9 +996,21 @@ for name in $what; do
 		fi
 	    fi

+	    echo Starting Ceph $name on $host...
+
+	    if [ $type = "mds" ]; then
+	        echo "Waiting for ceph-mon to respond before starting ceph-mds..."
+		execute_ceph_cmd CMD_OUTPUT $name "ceph fsid"
+	        if [ $? -ne 0 ]; then
+	            log $name "INFO" "Cannot start $name process. Ceph-mon is not working..."
+	            echo "Cannot start ceph-mds. Ceph-mon is not working..."
+	            EXIT_STATUS=$errcode
+		    continue
+	        fi
+	    fi
+
 	    save_proc_startup_ok $name

-	    echo Starting Ceph $name on $host...
 	    if [ ! -d $run_dir ]; then
 		# assume /var/run exists
 		install -d -m0770 -o ceph -g ceph /var/run/ceph
@ -1067,10 +1087,10 @@ EOF
 		# first try to gracefully close process, this should be fast if
 		# its threads still respond to the TERM signal
 		log $name "DEBUG" ">>> Sending term signal"
-		stop_daemon $name ceph-$type $pid_file TERM "" 5
+		stop_daemon $name ceph-$type $pid_file -SIGTERM "" 5
 		log $name "DEBUG" ">>> Sending kill signal"
 		# then just kill it
-		stop_daemon $name ceph-$type $pid_file KILL
+		stop_daemon $name ceph-$type $pid_file -SIGKILL
 	    fi

 	    [ -n "$pidfile" ] && rm -f $pidfile
@ -1182,7 +1202,17 @@ EOF
 	    get_conf pre_forcestop "" "pre forcestop command"
 	    get_conf post_forcestop "" "post forcestop command"
 	    [ -n "$pre_forcestop" ] && do_cmd "$pre_forcestop"
-	    stop_daemon $name ceph-$type $pid_file -9
+
+	    # first try to gracefully close process, this should be fast if
+	    # its threads still respond to the TERM signal
+	    wlog $name "DEBUG" ">>> Sending term signal"
+	    stop_daemon $name ceph-$type $pid_file -SIGTERM "" 5
+	    wlog $name "DEBUG" ">>> Sending kill signal"
+	    # then just kill it
+	    stop_daemon $name ceph-$type $pid_file -SIGKILL
+
+	    [ -n "$pidfile" ] && rm -f $pidfile
+	    [ -n "$asok" ] && rm -f $asok
 	    [ -n "$post_forcestop" ] && do_cmd "$post_forcestop"
 	    [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
 	    ;;
--- a/ceph/ceph/debian/deb_folder/ceph-base.install
+++ b/ceph/ceph/debian/deb_folder/ceph-base.install
@ -25,8 +25,10 @@ usr/bin/ceph-detect-init
 etc/init.d/ceph
 etc/init.d/mgr-restful-plugin
 etc/init.d/ceph-init-wrapper
+etc/init.d/ceph-storage-network
 etc/ceph/ceph.conf.pmon
 etc/ceph/ceph-mds.conf.pmon
+etc/ceph/ceph-fixed-mon.conf.pmon
 etc/ceph/ceph.conf
 etc/services.d/*
 usr/sbin/ceph-preshutdown.sh
--- a/ceph/ceph/debian/deb_folder/rules
+++ b/ceph/ceph/debian/deb_folder/rules
@ -14,7 +14,8 @@ SOURCE8 := ceph.service
 SOURCE9 := mgr-restful-plugin.service
 SOURCE10 := ceph-preshutdown.sh
 SOURCE11 := stx-containerd-ceph-override.conf
-
+SOURCE12 := ceph-storage-network.sh
+SOURCE13 := ceph-fixed-mon.conf.pmon

 # Paths
 export DESTDIR = $(CURDIR)/debian/tmp
@ -198,6 +199,8 @@ override_dh_auto_install:
 	install -D -m 644 ${SOURCE9} $(DESTDIR)/${UNITDIR}/mgr-restful-plugin.service
 	install -D -m 700 ${SOURCE10} $(DESTDIR)/${SBINDIR}/ceph-preshutdown.sh
 	install -D -m 644 ${SOURCE11} $(DESTDIR)/${UNITDIR}/containerd.service.d/stx-containerd-ceph-override.conf
+	install -D -m 750 ${SOURCE12} $(DESTDIR)/${INITDIR}/ceph-storage-network
+	install -D -m 750 ${SOURCE13} $(DESTDIR)/${SYSCONFDIR}/ceph/
 	install -m 750 src/init-radosgw $(DESTDIR)/${INITDIR}/ceph-radosgw
 	sed -i '/### END INIT INFO/a SYSTEMCTL_SKIP_REDIRECT=1' $(DESTDIR)/${INITDIR}/ceph-radosgw
 	install -m 750 src/init-rbdmap $(DESTDIR)/${INITDIR}/rbdmap
@ -280,6 +283,7 @@ override_dh_fixperms:
 	-Xceph.conf.pmon  \
 	-Xceph-mds.conf.pmon  \
 	-Xceph-init-wrapper  \
+	-Xceph-storage-network  \
 	-Xceph.conf  \
 	-Xceph-manage-journal  \
 	-Xceph.service  \
--- a/ceph/ceph/files/ceph-fixed-mon.conf.pmon
+++ b/ceph/ceph/files/ceph-fixed-mon.conf.pmon
@ -0,0 +1,26 @@
+[process]
+process  = ceph-fixed-mon
+script   = /etc/init.d/ceph-init-wrapper
+
+style    = lsb
+severity = major          ; minor, major, critical
+restarts = 5              ; restart retries before error assertion
+interval = 30             ; number of seconds to wait between restarts
+
+mode = status             ; Monitoring mode: passive (default) or active
+                          ; passive: process death monitoring (default: always)
+                          ; active : heartbeat monitoring, i.e. request / response messaging
+                          ; status : determine process health with executing "status" command
+                          ;          "start" is used to start the process(es) again
+                          ; ignore : do not monitor or stop monitoring
+
+; Status and Active Monitoring Options
+
+period     = 30           ; monitor period in seconds
+timeout    = 120          ; for active mode, messaging timeout period in seconds, must be shorter than period
+                          ; for status mode, max amount of time for a command to execute
+
+; Status Monitoring Options
+start_arg   = start mon.${HOSTNAME}  ; start argument for the script
+status_arg  = status mon.${HOSTNAME}  ; status argument for the script
+status_failure_text = /tmp/ceph_status_failure.txt   ; text to be added to alarms or logs, this is optional
--- a/ceph/ceph/files/ceph-init-wrapper.sh
+++ b/ceph/ceph/files/ceph-init-wrapper.sh
@ -84,6 +84,11 @@ args=("$@")
 if [ ! -z $ARGS ]; then
    IFS=";" read -r -a new_args <<< "$ARGS"
    args+=("${new_args[@]}")
+else
+    # Since PMON uses a unique string to pass arguments,
+    # it must support splitting the string into the args array.
+    #   Eg.: /etc/init.d/ceph-init-wrapper "start mds".
+    IFS=" " read -r -a args <<< "$@"
 fi

 # Log Management
@ -108,6 +113,23 @@ log () {
    return 0
 }

+# Identify the ceph network interface from /etc/platform/platform.conf file
+# The network interface will be set to the 'ceph_network_interface' variable
+# Return 0 if found the variable, and 1 if not.
+identify_ceph_network_interface() {
+    if [ "${ceph_network}" == "mgmt" ]; then
+        ceph_network_interface="${management_interface}"
+        return 0
+    fi
+
+    if [ "${ceph_network}" == "cluster-host" ]; then
+        ceph_network_interface="${cluster_host_interface}"
+        return 0
+    fi
+
+    return 1
+}
+
 # Verify if drbd-cephmon role is primary, checking the output of 'drbdadm role'
 # Return 0 on success and 1 if drbd-cephmon is not primary
 is_drbd_cephmon_primary ()
@ -143,11 +165,11 @@ is_drbd_cephmon_mounted ()
 has_all_network_no_carrier()
 {
    ip link show "${oam_interface}" | grep NO-CARRIER
-    oam_carrier=$?
+    local oam_carrier=$?
    ip link show "${cluster_host_interface}" | grep NO-CARRIER
-    cluster_host_carrier=$?
+    local cluster_host_carrier=$?
    ip link show "${management_interface}" | grep NO-CARRIER
-    mgmt_carrier=$?
+    local mgmt_carrier=$?

    # Check if all networks have no carrier, meaning the other host is down
    if [ "${oam_carrier}" -eq 0 ] && [ "${cluster_host_carrier}" -eq 0 ] && [ "${mgmt_carrier}" -eq 0 ]; then
@ -157,17 +179,23 @@ has_all_network_no_carrier()
    return 1
 }

-# Check mgmt network carrier signal
-has_mgmt_network_carrier()
+# Check Ceph network carrier signal
+#
+# Globals: ceph_network and the interface variable it maps to (read);
+#          ceph_network_interface (written by identify_ceph_network_interface)
+# Returns: 0 when the Ceph interface has carrier, or when the interface
+#          cannot be determined (detection is skipped); 1 on NO-CARRIER
+has_ceph_network_carrier()
 {
-    # Checks the carrier (cable connected) for management interface
-    # If no-carrier message is detected, then the interface has no physical link
-    ip link show "${management_interface}" | grep NO-CARRIER
+    # Checks the carrier (cable connected) for Ceph network interface
+    # If no-carrier is detected, then the interface has no physical link
+    local interface=""
+    if identify_ceph_network_interface; then
+        # "mgmt" and "cluster-host" do not match their variable names
+        # (management_interface, cluster_host_interface), so they are
+        # mapped explicitly instead of being pasted into a variable name.
+        interface="${ceph_network_interface}"
+    else
+        # Any other name: look up <name>_interface without eval. A '-' is
+        # not valid in a variable name, so it is translated to '_'.
+        local interface_var="${ceph_network//-/_}_interface"
+        if [[ "${interface_var}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
+            interface="${!interface_var}"
+        fi
+    fi
+    if [ -z "${interface}" ]; then
+        log ERROR "Cannot detect Ceph network. Skipping network carrier detection"
+        return 0
+    fi
+
+    ip link show "${interface}" | grep NO-CARRIER
    if [ $? -eq 0 ]; then
-        log INFO "Management Interface '${management_interface}' has NO-CARRIER, cannot start ceph-mon"
+        log INFO "Ceph network '${interface}' has NO-CARRIER, cannot start ceph-mon"
        return 1
    fi
-    log "-" DEBUG "Management Interface '${management_interface}' is working"
+    log DEBUG "Ceph network '${interface}' is working"
    return 0
 }

@ -256,6 +284,25 @@ with_service_lock ()
    RC=$?
 }

+has_daemon_running ()
+{
+    local service="$1"
+    if [ ${#service} -eq 3 ]; then
+        # Check based on service type
+        local count_pid_files=$(ls -1 /var/run/ceph/${service}.*.pid 2>/dev/null | wc -l)
+        if [ ${count_pid_files} -gt 0 ]; then
+            return 0
+        fi
+    else
+        # Check based on service name
+        if [ -f /var/run/ceph/${service}.pid ]; then
+            return 0
+        fi
+    fi
+
+    return 1
+}
+
 start ()
 {
    if [ ! -f ${CEPH_FILE} ]; then
@ -264,21 +311,25 @@ start ()
    fi

    local service="$1"
+    # Evaluate the parameter because of local monitor (controller.${HOSTNAME})
+    eval service="${service}"
+
+    log INFO "Ceph START ${service} command received"

    # For AIO-DX, ceph services have special treatment
    if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; then

-        # For ceph mon, check if drbd-cephmon is ready
-        if [ "${service}" == "mon" ]; then
+        # For ceph mon.controller (floating monitor), check if drbd-cephmon is ready
+        if [ "${service}" == "mon.controller" ]; then
            can_start_ceph_mon
            if [ $? -ne 0 ]; then
-                log INFO "Ceph Monitor is not ready to start because drbd-cephmon is not ready and mounted"
+                log INFO "Ceph Monitor cannot start because drbd-cephmon is not ready and mounted."
                exit 1
            fi
        fi

-        # Check mgmt network state
-        has_mgmt_network_carrier
+        # Check Ceph network state
+        has_ceph_network_carrier
        if [ $? -ne 0 ]; then
            # If this is a AIO-DX Direct, check if all other network interfaces are down
            if [ "${system_mode}" == "duplex-direct" ]; then
@ -286,31 +337,43 @@ start ()
                if [ $? -eq 0 ]; then
                    log INFO "All network interfaces are not functional, considering the other host is down. Let Ceph start."
                else
-                    # Else AIO-DX Direct mgmt network is NOT functional
-                    log INFO "Management Interface is not functional, defer starting Ceph processes until recovered"
+                    # Else AIO-DX Direct Ceph network is NOT functional
+                    log INFO "Ceph network interface is not functional, defer starting Ceph processes until recovered"
                    exit 1
                fi
            else
-                # Else AIO-DX mgmt network is NOT functional
-                log INFO "Management Interface is not functional, defer starting Ceph processes until recovered"
+                # Else AIO-DX Ceph network is NOT functional
+                log INFO "Ceph network interface is not functional, defer starting Ceph processes until recovered"
                exit 1
            fi
        fi
    fi

    # Start the service
-    log INFO "Ceph START ${service} command received"
    with_service_lock "${service}" ${CEPH_SCRIPT} start ${service}
    log INFO "Ceph START ${service} command finished."
 }

 stop ()
 {
+    # Usage: stop [<service>] [force]
+    #   <service>  daemon type or name; it is eval-expanded so that a
+    #              literal '${HOSTNAME}' in the argument gets resolved
+    #   force      when passed as $2, 'forcestop' is issued instead of 'stop'
+    # Exits 0 without calling ${CEPH_SCRIPT} when a service is given and
+    # has_daemon_running finds no pid file for it.
+    local cmd="stop"
    local service="$1"
+    # Evaluate the parameter because of local monitor (controller.${HOSTNAME})
+    # NOTE(review): eval executes whatever the argument contains; callers must be trusted.
+    eval service="${service}"
+    [ "$2" == "force" ] && cmd="forcestop"

-    log INFO "Ceph STOP $1 command received."
-    with_service_lock "$1" ${CEPH_SCRIPT} stop $1
-    log INFO "Ceph STOP $1 command finished."
+    log INFO "Ceph ${cmd^^} ${service} command received."
+
+    # The closing ']' must be a separate word, otherwise '[' fails and the
+    # check below is never reached.
+    if [ -n "${service}" ]; then
+        has_daemon_running "${service}"
+        if [ $? -ne 0 ]; then
+            log INFO "Ceph ${service} daemon is already stopped. No action is required."
+            exit 0
+        fi
+    fi
+
+    with_service_lock "${service}" ${CEPH_SCRIPT} ${cmd} ${service}
+    log INFO "Ceph ${cmd^^} ${service} command finished."
 }

 restart ()
@ -386,6 +449,8 @@ log_and_kill_hung_procs ()
 status ()
 {
    local target="$1"  # no shift here
+    # Evaluate the parameter because of local monitor (controller.${HOSTNAME})
+    eval target="$target"
    [ -z "${target}" ] && target="mon osd"

    if [ ! -f ${CEPH_FILE} ]; then
@ -393,29 +458,31 @@ status ()
        exit 0
    fi

-    if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then
-        has_mgmt_network_carrier
+    log INFO "status ${target}";
+
+    if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$target" == "osd" ]]; then
+        has_ceph_network_carrier
        if [ $? -eq 0 ]; then
            # Network is functional, continue
-            log DEBUG "Management Interface active."
+            log DEBUG "Ceph network interface is active."
        else
            if [ "${system_mode}" == "duplex-direct" ]; then
                has_all_network_no_carrier
                if [ $? -ne 0 ]; then
                    # Network is NOT functional, prevent split brain corruptions
-                    log INFO "Management Interface inactive. Stopping OSDs to force a re-peering once the network has recovered"
-                    stop "$1"
+                    log INFO "Ceph network interface is inactive. Stopping OSDs to force a re-peering once the network has recovered"
+                    stop "$target"
                    exit 0
                fi
            else
                # Network is NOT functional, prevent split brain corruptions
-                log INFO "Management Interface inactive. Stopping OSDs to force a re-peering once the network has recovered"
-                stop "$1"
+                log INFO "Ceph network interface is inactive. Stopping OSDs to force a re-peering once the network has recovered"
+                stop "$target"
                exit 0
            fi
        fi

-        timeout $CEPH_STATUS_TIMEOUT ceph -s
+        timeout $CEPH_STATUS_TIMEOUT ceph -s 2>&1 1>/dev/null
        if [ "$?" -ne 0 ]; then
            # Ceph cluster is not accessible. Don't panic, controller swact
            # may be in progress.
@ -447,21 +514,22 @@ status ()
        flock --shared ${LOCK_CEPH_OSD_STATUS_FD}
    fi

-    result=`log INFO "status $1"; ${CEPH_SCRIPT} status $1 {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-`
+    result=`${CEPH_SCRIPT} status $target {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-`
    RC=$?
    if [ "$RC" -ne 0 ]; then
-        erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
-        hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
-        blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
-        stuck_peering_procs=`echo "$result" | sort | uniq | awk ' /stuck peering/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
+        # '$1' below is awk's first field (presumably the daemon name that
+        # starts each status line), not a shell parameter: it must stay '$1'.
+        # An unset awk variable used as a field index selects $0 (whole line).
+        erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
+        hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
+        blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
+        stuck_peering_procs=`echo "$result" | sort | uniq | awk ' /stuck peering/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
        invalid=0
        host=`hostname`
        if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
            # On 2 node configuration we have a floating monitor
+            host_fixed="$host"
            host="controller"
        fi
        for i in $(echo $erred_procs $hung_procs); do
-            if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host ]]; then
+            if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host|mon.$host_fixed|mds.${HOSTNAME} ]]; then
                continue
            else
                invalid=1
@ -485,12 +553,12 @@ status ()
            done
            echo "$text" | tr -d '\n' > $CEPH_STATUS_FAILURE_TEXT_FILE
        else
-            echo "$host: '${CEPH_SCRIPT} status $1' result contains invalid process names: $erred_procs"
+            echo "$host: '${CEPH_SCRIPT} status $target' result contains invalid process names: $erred_procs"
            echo "Undetermined osd or monitor id" > $CEPH_STATUS_FAILURE_TEXT_FILE
        fi
    fi

-    if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
+    if [[ $RC == 0 ]] && [[ "$target" == "mon.controller" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
        # SM needs exit code != 0 from 'status mon' argument of the init script on
        # standby controller otherwise it thinks that the monitor is running and
        # tries to stop it.
@ -504,20 +572,20 @@ status ()
        if [ "$?" -ne 0 ]; then
            exit 3
        else
-            has_mgmt_network_carrier
+            has_ceph_network_carrier
            if [ $? -ne 0 ]; then
                if [ "${system_mode}" == "duplex-direct" ]; then
                    has_all_network_no_carrier
                    if [ $? -ne 0 ]; then
                        # Network is NOT functional, prevent split brain corruptions
-                        log INFO "Management Interface inactive. Stopping ceph-mon to prevent localized operation"
-                        stop "$1"
+                        log INFO "Ceph network interface is inactive. Stopping ceph-mon to prevent localized operation"
+                        stop "$target"
                        exit 0
                    fi
                else
                    # Network is NOT functional, prevent split brain corruptions
-                    log INFO "Management Interface inactive. Stopping ceph-mon to prevent localized operation"
-                    stop "$1"
+                    log INFO "Ceph network interface is inactive. Stopping ceph-mon to prevent localized operation"
+                    stop "$target"
                    exit 0
                fi
            fi
@ -535,6 +603,9 @@ case "${args[0]}" in
    stop)
        stop ${args[1]}
        ;;
+    forcestop)
+        stop ${args[1]} force
+        ;;
    restart)
        restart ${args[1]}
        ;;
@ -542,7 +613,7 @@ case "${args[0]}" in
        status ${args[1]}
        ;;
    *)
-        echo "Usage: $0 {start|stop|restart|status} [{mon|osd|osd.<number>|mon.<hostname>}]"
+        echo "Usage: $0 {start|stop|forcestop|restart|status} [{mon|osd|osd.<number>|mon.<hostname>}]"
        exit 1
        ;;
 esac
--- a/ceph/ceph/files/ceph-mds.conf.pmon
+++ b/ceph/ceph/files/ceph-mds.conf.pmon
@ -1,6 +1,6 @@
 [process]
 process  = ceph-mds
-script   = /etc/init.d/ceph
+script   = /etc/init.d/ceph-init-wrapper

 style    = lsb
 severity = major          ; minor, major, critical
--- a/ceph/ceph/files/ceph-storage-network.sh
+++ b/ceph/ceph/files/ceph-storage-network.sh
@ -0,0 +1,143 @@
+#!/bin/bash
+#
+# Copyright (c) 2024 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This script monitors the Ceph network for carrier on an AIO-DX system.
+# To prevent data corruption, when there is no carrier from the Ceph network,
+# the floating monitor, the osds and the mds processes will be stopped.
+
+source /etc/platform/platform.conf
+
+CEPH_FILE="/var/run/.ceph_started"
+CEPH_SCRIPT="/etc/init.d/ceph-init-wrapper"
+
+source /usr/lib/ceph/ceph_common.sh
+LOG_PATH=/var/log/ceph
+LOG_FILE=$LOG_PATH/ceph-process-states.log
+LOG_LEVEL=NORMAL  # DEBUG
+
+# Log Management
+# Adding PID and PPID informations
+log () {
+    local name=""
+    local log_level="$1"
+    # Checking if the first parameter is not a log level
+    if grep -q -v ${log_level} <<< "INFO DEBUG WARN ERROR"; then
+        name=" ($1)";
+        log_level="$2"
+        shift
+    fi
+
+    shift
+
+    local message="$@"
+    # prefix = <pid_subshell> <ppid_name>[<ppid>] <name|optional>
+    local prefix="${BASHPID} $(cat /proc/${PPID}/comm)[${PPID}]${name}"
+    # yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/ceph-storage-network <prefix> <log_level>: <message>
+    wlog "${prefix}" "${log_level}" "${message}"
+    return 0
+}
+
+identify_ceph_network_interface() {
+    if [ "${ceph_network}" == "mgmt" ]; then
+        ceph_network_interface="${management_interface}"
+        return 0
+    fi
+
+    if [ "${ceph_network}" == "cluster-host" ]; then
+        ceph_network_interface="${cluster_host_interface}"
+        return 0
+    fi
+
+    return 1
+}
+
+RETVAL=0
+
+################################################################################
+# Stop Ceph Services
+################################################################################
+
+stop()
+{
+    # Default list; on AIO-DX only the floating monitor (mon.controller)
+    # is targeted instead of every monitor.
+    # This script should run only in AIO-DX called by sm. Double check it.
+    services="osd mds mon"
+    if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; then
+        services="osd mds mon.controller"
+    fi
+
+    # Force-stop one service at a time: ceph-osd, ceph-mds, then ceph-mon.
+    # The function's status is that of the last forcestop call.
+    for service in ${services}; do
+        ${CEPH_SCRIPT} forcestop ${service}
+    done
+}
+
+################################################################################
+# Status Action
+################################################################################
+
+has_ceph_network_carrier()
+{
+    # Checks the carrier (cable connected) for Ceph network interface
+    # If no-carrier is detected, then the interface has no physical link
+    eval local interface=\$${ceph_network}_interface
+    if [ -z ${interface} ]; then
+        log ERROR "Cannot detect Ceph network. Skipping network carrier detection"
+        return 0
+    fi
+
+    ip link show "${interface}" | grep NO-CARRIER
+    if [ $? -eq 0 ]; then
+        log INFO "Ceph network '${interface}' has NO-CARRIER, cannot start ceph-mon"
+        return 1
+    fi
+    return 0
+}
+
+status()
+{
+    # Without the ${CEPH_FILE} marker Ceph is not running on this node:
+    # nothing to verify, report success.
+    [ -f ${CEPH_FILE} ] || return 0
+
+    if ! has_ceph_network_carrier; then
+        # Communication failure detected: stop the Ceph services to avoid
+        # data corruption and flag the failure through RETVAL.
+        stop
+        RETVAL=1
+    fi
+
+    return 0
+}
+
+################################################################################
+
+# Main Entry
+
+################################################################################
+
+# 'start' and 'status' both run the carrier check; 'stop' is a no-op that
+# reports success. The script exits with RETVAL.
+case "$1" in
+    start|status)
+        status
+        ;;
+    stop)
+        RETVAL=0
+        ;;
+    *)
+        echo "usage: $0 { start | stop | status }"
+        exit 1
+        ;;
+esac
+
+exit $RETVAL
+
--- a/config/puppet-modules/openstack/puppet-ceph-2.4.1/debian/patches/0017-Add-mon_data-parameter.patch
+++ b/config/puppet-modules/openstack/puppet-ceph-2.4.1/debian/patches/0017-Add-mon_data-parameter.patch
@ -0,0 +1,110 @@
+From e6f5d2cd267564ee97f53447ba1419d1ace641a1 Mon Sep 17 00:00:00 2001
+From: Hediberto C Silva <hediberto.cavalcantedasilva@windriver.com>
+Date: Tue, 19 Mar 2024 17:17:10 -0300
+Subject: [PATCH] Add mon_data parameter
+
+If supplied, the mon_data parameter defines where the ceph-mon data
+will be located.
+
+Signed-off-by: Hediberto C Silva <hediberto.cavalcantedasilva@windriver.com>
+---
+ manifests/mon.pp | 33 ++++++++++++++++++++++++++-------
+ 1 file changed, 26 insertions(+), 7 deletions(-)
+
+diff --git a/manifests/mon.pp b/manifests/mon.pp
+index 6d1294e..4615d3c 100644
+--- a/manifests/mon.pp
+++ b/manifests/mon.pp
+@@ -60,6 +60,7 @@
+ define ceph::mon (
+   $ensure = present,
+   $mon_enable = true,
+  $mon_data = '',
+   $public_addr = undef,
+   $cluster = undef,
+   $authentication_type = 'cephx',
+@@ -139,7 +140,10 @@ chmod 0444 ${keyring_path}
+ ",
+             unless  => "/bin/true # comment to satisfy puppet syntax requirements
+ set -ex
+-mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data) || exit 1
+mon_data=\"${mon_data}\"
+if [ -z \${mon_data} ]; then
+  mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data) || exit 1
+fi
+ # if ceph-mon fails then the mon is probably not configured yet
+ test -e \$mon_data/done
+ ",
+@@ -178,7 +182,10 @@ test -e /etc/ceph/${cluster_name}.client.admin.keyring",
+       -> exec { $ceph_mkfs:
+         command   => "/bin/true # comment to satisfy puppet syntax requirements
+ set -ex
+-mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+mon_data=\"${mon_data}\"
+if [ -z \${mon_data} ]; then
+  mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+fi
+ if [ ! -d \$mon_data ] ; then
+     mkdir -p \$mon_data
+     if getent passwd ceph >/dev/null 2>&1; then
+@@ -188,7 +195,8 @@ if [ ! -d \$mon_data ] ; then
+               --mkfs \
+               --id ${id} \
+               --keyring ${keyring_path} \
+-              ${fsid_option} ; then
+              ${fsid_option} \
+              --mon-data \$mon_data ; then
+             touch \$mon_data/done \$mon_data/${init} \$mon_data/keyring
+             chown -h ceph:ceph \$mon_data/done \$mon_data/${init} \$mon_data/keyring
+         else
+@@ -199,7 +207,8 @@ if [ ! -d \$mon_data ] ; then
+               --mkfs \
+               --id ${id} \
+               --keyring ${keyring_path} \
+-              ${fsid_option} ; then
+              ${fsid_option} \
+              --mon-data \$mon_data ; then
+             touch \$mon_data/done \$mon_data/${init} \$mon_data/keyring
+         else
+             rm -fr \$mon_data
+@@ -209,7 +218,10 @@ fi
+ ",
+         unless    => "/bin/true # comment to satisfy puppet syntax requirements
+ set -ex
+-mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+mon_data=\"${mon_data}\"
+if [ -z \${mon_data} ]; then
+  mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+fi
+ test -d  \$mon_data
+ ",
+         logoutput => true,
+@@ -244,13 +256,19 @@ test ! -e ${keyring_path}
+       -> exec { "remove-mon-${id}":
+         command   => "/bin/true # comment to satisfy puppet syntax requirements
+ set -ex
+-mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+mon_data=\"${mon_data}\"
+if [ -z \${mon_data} ]; then
+  mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+fi
+ rm -fr \$mon_data
+ ",
+         unless    => "/bin/true # comment to satisfy puppet syntax requirements
+ set -ex
+ which ceph-mon || exit 0 # if ceph-mon is not available we already uninstalled ceph and there is nothing to do
+-mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+mon_data=\"${mon_data}\"
+if [ -z \${mon_data} ]; then
+  mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+fi
+ test ! -d \$mon_data
+ ",
+         logoutput => true,
+@@ -263,3 +281,4 @@ test ! -d \$mon_data
+       fail('Ensure on MON must be either present or absent')
+     }
+   }
+
+-- 
+2.34.1
--- a/config/puppet-modules/openstack/puppet-ceph-2.4.1/debian/patches/series
+++ b/config/puppet-modules/openstack/puppet-ceph-2.4.1/debian/patches/series
@ -13,3 +13,4 @@
 0014-Adjust-puppet-ceph-dependency-requirements.patch
 0015-Fix-the-unless-condition-of-ceph-osd-prepare.patch
 0016-Add-ms_bind_ipv4-option-to-ceph-paremeters.patch
+0017-Add-mon_data-parameter.patch