Adds support for 3 monitors on AIO-DX

Adding two fixed Ceph monitors for controllers when deploying an
AIO-DX to improve HA.

Includes:
 - Creates a new shell script to manage the fixed monitors
 - Creates a new patch to include the mon_data parameter in
   the mon.pp puppet manifest.
 - Creates the ceph-storage-network script that is used by the
   storage-networking SM service to stop Ceph services in case
   of a network outage.
 - Alters the script used by the ceph-mds pmon file to
   ceph-init-wrapper.
 - Adjusts the ceph-init-wrapper to accept commands from pmon service.
 - Adjusts the ceph-init-wrapper to accept the forcestop command.
 - When stopping Ceph services through ceph-init-wrapper, the pid file
   is checked first and the stop is skipped if it does not exist.
 - Stopping ceph-mon service using ceph-init-wrapper,
   the ceph-mds is stopped right before to force a re-peering.
 - Starting ceph-mon service using ceph-init-wrapper,
   the ceph-mds is stopped right before to force a re-peering.
 - When starting ceph-mds, ceph-mon is first checked to be operational.
 - The forcestop command uses a TERM signal first before
   attempting a KILL signal after 5 seconds.

Test Plan:
  PASS: Fresh install AIO-DX and check 3 Ceph monitors are running.
  PASS: Fresh install all other setups and check if Ceph is working as
expected.
  PASS: Reboot the standby controller and check that Ceph is still
running.
  PASS: Reboots the active controller. Ceph will stop responding, but
it will recover after both controllers are running.
  PASS: Verify Ceph is working after a DOR test with PODs writing
to the cephfs and rbd pools.
  PASS: Verify Ceph is resilient to switch reboots

Story: 2011122
Task: 50129

Change-Id: I18d7ab9da3303265da34bc13c8be4baa23c2a7be
Signed-off-by: Hediberto C Silva <hediberto.cavalcantedasilva@windriver.com>
Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com>
This commit is contained in:
Hediberto C Silva 2024-04-02 16:15:54 -03:00 committed by Felipe Sanches Zanoni
parent 8e1e55284e
commit d5a84a1dbc
9 changed files with 447 additions and 60 deletions

View File

@ -623,7 +623,7 @@ stop_daemon() {
if [ \$timeout -lt 0 ]; then
break
fi
timeout-=1
timeout=\$((timeout-1))
fi
cmd=\"kill $signal \$pid\"
printf \"\$cmd...\"
@ -769,7 +769,7 @@ fi
# When this is a AIO-DX pmon is monitoring ceph-mds process.
# If ceph-mon is not running, ceph-mds will hang when starting.
# Check if we are trying to bring up ceph-mds and ceph-mon is not ready yet
if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" == "duplex" ]; then
if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; then
if [ "${command}" = "start" -o "${command}" = "onestart" ]; then
what_out=
what_mds=
@ -873,6 +873,12 @@ for name in $what; do
# conf file
cmd="$cmd -c $conf"
# StarlingX:
# If this is AIO-DX, check if service is the fixed Ceph monitor and set the parameter --mon-data
if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ] && [ "$type" == "mon" ] && [ "$id" == ${HOSTNAME} ]; then
cmd="$cmd --mon-data /var/lib/ceph/data/ceph-${HOSTNAME}"
fi
if echo $name | grep -q ^osd; then
get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
get_conf fs_path "$osd_data" "fs path" # mount point defaults so osd data
@ -928,13 +934,15 @@ for name in $what; do
[ -n "$TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES" ] && tcmalloc="TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=$TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES"
# StarlingX: start processes in scope under slice system-ceph.slice
# so that ceph processes do not start under this script's callers cgroup
if [ "$type" = "osd" ]; then
cmd="systemd-run --scope --unit=ceph-${type}-${id} --slice=system-ceph $cmd"
else
cmd="systemd-run --scope --unit=ceph-${type} --slice=system-ceph $cmd"
fi
# StarlingX: start processes in scope under slice system-ceph.slice
# so that ceph processes do not start under this script's callers cgroup
if [ "$type" = "osd" ]; then
cmd="systemd-run --scope --unit=ceph-${type}-${id} --slice=system-ceph $cmd"
elif [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ] && [ "$type" == "mon" ] && [ "$id" == ${HOSTNAME} ]; then
cmd="systemd-run --scope --unit=ceph-${type}-${id} --slice=system-ceph $cmd"
else
cmd="systemd-run --scope --unit=ceph-${type} --slice=system-ceph $cmd"
fi
# StarlingX: not running as ceph user/group
cmd="$files $tcmalloc $wrap $cmd --cluster $cluster $runmode"
@ -988,9 +996,21 @@ for name in $what; do
fi
fi
echo Starting Ceph $name on $host...
if [ $type = "mds" ]; then
echo "Waiting for ceph-mon to respond before starting ceph-mds..."
execute_ceph_cmd CMD_OUTPUT $name "ceph fsid"
if [ $? -ne 0 ]; then
log $name "INFO" "Cannot start $name process. Ceph-mon is not working..."
echo "Cannot start ceph-mds. Ceph-mon is not working..."
EXIT_STATUS=$errcode
continue
fi
fi
save_proc_startup_ok $name
echo Starting Ceph $name on $host...
if [ ! -d $run_dir ]; then
# assume /var/run exists
install -d -m0770 -o ceph -g ceph /var/run/ceph
@ -1067,10 +1087,10 @@ EOF
# first try to gracefully close process, this should be fast if
# its threads still respond to the TERM signal
log $name "DEBUG" ">>> Sending term signal"
stop_daemon $name ceph-$type $pid_file TERM "" 5
stop_daemon $name ceph-$type $pid_file -SIGTERM "" 5
log $name "DEBUG" ">>> Sending kill signal"
# then just kill it
stop_daemon $name ceph-$type $pid_file KILL
stop_daemon $name ceph-$type $pid_file -SIGKILL
fi
[ -n "$pidfile" ] && rm -f $pidfile
@ -1182,7 +1202,17 @@ EOF
get_conf pre_forcestop "" "pre forcestop command"
get_conf post_forcestop "" "post forcestop command"
[ -n "$pre_forcestop" ] && do_cmd "$pre_forcestop"
stop_daemon $name ceph-$type $pid_file -9
# first try to gracefully close process, this should be fast if
# its threads still respond to the TERM signal
wlog $name "DEBUG" ">>> Sending term signal"
stop_daemon $name ceph-$type $pid_file -SIGTERM "" 5
wlog $name "DEBUG" ">>> Sending kill signal"
# then just kill it
stop_daemon $name ceph-$type $pid_file -SIGKILL
[ -n "$pidfile" ] && rm -f $pidfile
[ -n "$asok" ] && rm -f $asok
[ -n "$post_forcestop" ] && do_cmd "$post_forcestop"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
;;

View File

@ -25,8 +25,10 @@ usr/bin/ceph-detect-init
etc/init.d/ceph
etc/init.d/mgr-restful-plugin
etc/init.d/ceph-init-wrapper
etc/init.d/ceph-storage-network
etc/ceph/ceph.conf.pmon
etc/ceph/ceph-mds.conf.pmon
etc/ceph/ceph-fixed-mon.conf.pmon
etc/ceph/ceph.conf
etc/services.d/*
usr/sbin/ceph-preshutdown.sh

View File

@ -14,7 +14,8 @@ SOURCE8 := ceph.service
SOURCE9 := mgr-restful-plugin.service
SOURCE10 := ceph-preshutdown.sh
SOURCE11 := stx-containerd-ceph-override.conf
SOURCE12 := ceph-storage-network.sh
SOURCE13 := ceph-fixed-mon.conf.pmon
# Paths
export DESTDIR = $(CURDIR)/debian/tmp
@ -198,6 +199,8 @@ override_dh_auto_install:
install -D -m 644 ${SOURCE9} $(DESTDIR)/${UNITDIR}/mgr-restful-plugin.service
install -D -m 700 ${SOURCE10} $(DESTDIR)/${SBINDIR}/ceph-preshutdown.sh
install -D -m 644 ${SOURCE11} $(DESTDIR)/${UNITDIR}/containerd.service.d/stx-containerd-ceph-override.conf
install -D -m 750 ${SOURCE12} $(DESTDIR)/${INITDIR}/ceph-storage-network
install -D -m 750 ${SOURCE13} $(DESTDIR)/${SYSCONFDIR}/ceph/
install -m 750 src/init-radosgw $(DESTDIR)/${INITDIR}/ceph-radosgw
sed -i '/### END INIT INFO/a SYSTEMCTL_SKIP_REDIRECT=1' $(DESTDIR)/${INITDIR}/ceph-radosgw
install -m 750 src/init-rbdmap $(DESTDIR)/${INITDIR}/rbdmap
@ -280,6 +283,7 @@ override_dh_fixperms:
-Xceph.conf.pmon \
-Xceph-mds.conf.pmon \
-Xceph-init-wrapper \
-Xceph-storage-network \
-Xceph.conf \
-Xceph-manage-journal \
-Xceph.service \

View File

@ -0,0 +1,26 @@
[process]
; Process-monitor entry for the per-host (fixed) Ceph monitor. The monitor is
; started and status-checked through ceph-init-wrapper using the literal
; argument mon.${HOSTNAME}; the wrapper is expected to expand ${HOSTNAME}.
process = ceph-fixed-mon
script = /etc/init.d/ceph-init-wrapper
style = lsb
severity = major ; minor, major, critical
restarts = 5 ; restart retries before error assertion
interval = 30 ; number of seconds to wait between restarts
mode = status ; Monitoring mode: passive (default) or active
; passive: process death monitoring (default: always)
; active : heartbeat monitoring, i.e. request / response messaging
; status : determine process health with executing "status" command
; "start" is used to start the process(es) again
; ignore : do not monitor or stop monitoring
; Status and Active Monitoring Options
period = 30 ; monitor period in seconds
timeout = 120 ; for active mode, messaging timeout period in seconds, must be shorter than period
; for status mode, max amount of time for a command to execute
; Status Monitoring Options
start_arg = start mon.${HOSTNAME} ; start argument for the script
status_arg = status mon.${HOSTNAME} ; status argument for the script
status_failure_text = /tmp/ceph_status_failure.txt ; text to be added to alarms or logs, this is optional

View File

@ -84,6 +84,11 @@ args=("$@")
if [ ! -z $ARGS ]; then
IFS=";" read -r -a new_args <<< "$ARGS"
args+=("${new_args[@]}")
else
# Since PMON uses a unique string to pass arguments,
# it must support splitting the string into the args array.
# Eg.: /etc/init.d/ceph-init-wrapper "start mds".
IFS=" " read -r -a args <<< "$@"
fi
# Log Management
@ -108,6 +113,23 @@ log () {
return 0
}
# Resolve which platform interface carries the Ceph network.
# Globals:
#   ceph_network            (read)  network type, "mgmt" or "cluster-host"
#   management_interface    (read)  used when ceph_network is "mgmt"
#   cluster_host_interface  (read)  used when ceph_network is "cluster-host"
#   ceph_network_interface  (written) the resolved interface name
# Returns:
#   0 when the network type is recognised, 1 otherwise (nothing is written)
identify_ceph_network_interface() {
    case "${ceph_network}" in
        mgmt)
            ceph_network_interface="${management_interface}"
            ;;
        cluster-host)
            ceph_network_interface="${cluster_host_interface}"
            ;;
        *)
            return 1
            ;;
    esac
    return 0
}
# Verify if drbd-cephmon role is primary, checking the output of 'drbdadm role'
# Return 0 on success and 1 if drbd-cephmon is not primary
is_drbd_cephmon_primary ()
@ -143,11 +165,11 @@ is_drbd_cephmon_mounted ()
has_all_network_no_carrier()
{
ip link show "${oam_interface}" | grep NO-CARRIER
oam_carrier=$?
local oam_carrier=$?
ip link show "${cluster_host_interface}" | grep NO-CARRIER
cluster_host_carrier=$?
local cluster_host_carrier=$?
ip link show "${management_interface}" | grep NO-CARRIER
mgmt_carrier=$?
local mgmt_carrier=$?
# Check if all networks have no carrier, meaning the other host is down
if [ "${oam_carrier}" -eq 0 ] && [ "${cluster_host_carrier}" -eq 0 ] && [ "${mgmt_carrier}" -eq 0 ]; then
@ -157,17 +179,23 @@ has_all_network_no_carrier()
return 1
}
# Check mgmt network carrier signal
has_mgmt_network_carrier()
# Check Ceph network carrier signal
has_ceph_network_carrier()
{
# Checks the carrier (cable connected) for management interface
# If no-carrier message is detected, then the interface has no physical link
ip link show "${management_interface}" | grep NO-CARRIER
# Checks the carrier (cable connected) for Ceph network interface
# If no-carrier is detected, then the interface has no physical link
eval local interface=\$${ceph_network}_interface
if [ -z ${interface} ]; then
log ERROR "Cannot detect Ceph network. Skipping network carrier detection"
return 0
fi
ip link show "${interface}" | grep NO-CARRIER
if [ $? -eq 0 ]; then
log INFO "Management Interface '${management_interface}' has NO-CARRIER, cannot start ceph-mon"
log INFO "Ceph network '${interface}' has NO-CARRIER, cannot start ceph-mon"
return 1
fi
log "-" DEBUG "Management Interface '${management_interface}' is working"
log DEBUG "Ceph network '${interface}' is working"
return 0
}
@ -256,6 +284,25 @@ with_service_lock ()
RC=$?
}
# Tell whether a Ceph daemon appears to be running, judging only by the
# presence of pid files under /var/run/ceph.
# Arguments:
#   $1 - either a service type of exactly three characters (e.g. "osd",
#        "mon", "mds") or a full service name (e.g. "osd.0")
# Returns:
#   0 if a matching pid file exists, 1 otherwise
has_daemon_running ()
{
    local service="$1"
    local pid_file

    if [ "${#service}" -eq 3 ]; then
        # Check based on service type: any <type>.<id>.pid file counts
        for pid_file in /var/run/ceph/"${service}".*.pid; do
            # An unmatched glob stays as the literal pattern, so confirm
            # the path really exists before reporting success
            [ -e "${pid_file}" ] && return 0
        done
        return 1
    fi

    # Check based on service name
    [ -f "/var/run/ceph/${service}.pid" ]
}
start ()
{
if [ ! -f ${CEPH_FILE} ]; then
@ -264,21 +311,25 @@ start ()
fi
local service="$1"
# Evaluate the parameter because of local monitor (controller.${HOSTNAME})
eval service="${service}"
log INFO "Ceph START ${service} command received"
# For AIO-DX, ceph services have special treatment
if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; then
# For ceph mon, check if drbd-cephmon is ready
if [ "${service}" == "mon" ]; then
# For ceph mon.controller (floating monitor), check if drbd-cephmon is ready
if [ "${service}" == "mon.controller" ]; then
can_start_ceph_mon
if [ $? -ne 0 ]; then
log INFO "Ceph Monitor is not ready to start because drbd-cephmon is not ready and mounted"
log INFO "Ceph Monitor cannot start because drbd-cephmon is not ready and mounted."
exit 1
fi
fi
# Check mgmt network state
has_mgmt_network_carrier
# Check Ceph network state
has_ceph_network_carrier
if [ $? -ne 0 ]; then
# If this is a AIO-DX Direct, check if all other network interfaces are down
if [ "${system_mode}" == "duplex-direct" ]; then
@ -286,31 +337,43 @@ start ()
if [ $? -eq 0 ]; then
log INFO "All network interfaces are not functional, considering the other host is down. Let Ceph start."
else
# Else AIO-DX Direct mgmt network is NOT functional
log INFO "Management Interface is not functional, defer starting Ceph processes until recovered"
# Else AIO-DX Direct Ceph network is NOT functional
log INFO "Ceph network interface is not functional, defer starting Ceph processes until recovered"
exit 1
fi
else
# Else AIO-DX mgmt network is NOT functional
log INFO "Management Interface is not functional, defer starting Ceph processes until recovered"
# Else AIO-DX Ceph network is NOT functional
log INFO "Ceph network interface is not functional, defer starting Ceph processes until recovered"
exit 1
fi
fi
fi
# Start the service
log INFO "Ceph START ${service} command received"
with_service_lock "${service}" ${CEPH_SCRIPT} start ${service}
log INFO "Ceph START ${service} command finished."
}
# Stop (or force-stop) a Ceph service through the ceph init script.
# Arguments:
#   $1 - service type or name (e.g. "osd", "mon.controller"); may be empty
#   $2 - "force" to run "forcestop" instead of "stop"
# Exits the whole script with status 0 when a service was named but
# has_daemon_running reports that it is not running: nothing to stop.
stop ()
{
    local cmd="stop"
    local service="$1"

    # Evaluate the parameter because of local monitor (controller.${HOSTNAME})
    # NOTE(review): eval expands anything in the argument, so this relies on
    # callers passing trusted values only.
    eval service="${service}"

    [ "$2" == "force" ] && cmd="forcestop"

    log INFO "Ceph ${cmd^^} ${service} command received."

    if [ -n "${service}" ]; then
        if ! has_daemon_running "${service}"; then
            log INFO "Ceph ${service} daemon is already stopped. No action is required."
            exit 0
        fi
    fi

    # The trailing ${service} is deliberately unquoted so that an empty
    # service adds no argument to the init script command line.
    with_service_lock "${service}" ${CEPH_SCRIPT} ${cmd} ${service}
    log INFO "Ceph ${cmd^^} ${service} command finished."
}
restart ()
@ -386,6 +449,8 @@ log_and_kill_hung_procs ()
status ()
{
local target="$1" # no shift here
# Evaluate the parameter because of local monitor (controller.${HOSTNAME})
eval target="$target"
[ -z "${target}" ] && target="mon osd"
if [ ! -f ${CEPH_FILE} ]; then
@ -393,29 +458,31 @@ status ()
exit 0
fi
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then
has_mgmt_network_carrier
log INFO "status ${target}";
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$target" == "osd" ]]; then
has_ceph_network_carrier
if [ $? -eq 0 ]; then
# Network is functional, continue
log DEBUG "Management Interface active."
log DEBUG "Ceph network interface is active."
else
if [ "${system_mode}" == "duplex-direct" ]; then
has_all_network_no_carrier
if [ $? -ne 0 ]; then
# Network is NOT functional, prevent split brain corruptions
log INFO "Management Interface inactive. Stopping OSDs to force a re-peering once the network has recovered"
stop "$1"
log INFO "Ceph network interface is inactive. Stopping OSDs to force a re-peering once the network has recovered"
stop "$target"
exit 0
fi
else
# Network is NOT functional, prevent split brain corruptions
log INFO "Management Interface inactive. Stopping OSDs to force a re-peering once the network has recovered"
stop "$1"
log INFO "Ceph network interface is inactive. Stopping OSDs to force a re-peering once the network has recovered"
stop "$target"
exit 0
fi
fi
timeout $CEPH_STATUS_TIMEOUT ceph -s
timeout $CEPH_STATUS_TIMEOUT ceph -s 2>&1 1>/dev/null
if [ "$?" -ne 0 ]; then
# Ceph cluster is not accessible. Don't panic, controller swact
# may be in progress.
@ -447,21 +514,22 @@ status ()
flock --shared ${LOCK_CEPH_OSD_STATUS_FD}
fi
result=`log INFO "status $1"; ${CEPH_SCRIPT} status $1 {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-`
result=`${CEPH_SCRIPT} status $target {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-`
RC=$?
if [ "$RC" -ne 0 ]; then
# Collect the first field of every matching status line. The $1 below is
# awk's first-field reference (protected from the shell by single quotes),
# not a shell positional parameter; the loop that follows expects process
# names such as osd.<n> or mon.<host> in these lists.
erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
stuck_peering_procs=`echo "$result" | sort | uniq | awk ' /stuck peering/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
invalid=0
host=`hostname`
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
# On 2 node configuration we have a floating monitor
host_fixed="$host"
host="controller"
fi
for i in $(echo $erred_procs $hung_procs); do
if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host ]]; then
if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host|mon.$host_fixed|mds.${HOSTNAME} ]]; then
continue
else
invalid=1
@ -485,12 +553,12 @@ status ()
done
echo "$text" | tr -d '\n' > $CEPH_STATUS_FAILURE_TEXT_FILE
else
echo "$host: '${CEPH_SCRIPT} status $1' result contains invalid process names: $erred_procs"
echo "$host: '${CEPH_SCRIPT} status $target' result contains invalid process names: $erred_procs"
echo "Undetermined osd or monitor id" > $CEPH_STATUS_FAILURE_TEXT_FILE
fi
fi
if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
if [[ $RC == 0 ]] && [[ "$target" == "mon.controller" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
# SM needs exit code != 0 from 'status mon' argument of the init script on
# standby controller otherwise it thinks that the monitor is running and
# tries to stop it.
@ -504,20 +572,20 @@ status ()
if [ "$?" -ne 0 ]; then
exit 3
else
has_mgmt_network_carrier
has_ceph_network_carrier
if [ $? -ne 0 ]; then
if [ "${system_mode}" == "duplex-direct" ]; then
has_all_network_no_carrier
if [ $? -ne 0 ]; then
# Network is NOT functional, prevent split brain corruptions
log INFO "Management Interface inactive. Stopping ceph-mon to prevent localized operation"
stop "$1"
log INFO "Ceph network interface is inactive. Stopping ceph-mon to prevent localized operation"
stop "$target"
exit 0
fi
else
# Network is NOT functional, prevent split brain corruptions
log INFO "Management Interface inactive. Stopping ceph-mon to prevent localized operation"
stop "$1"
log INFO "Ceph network interface is inactive. Stopping ceph-mon to prevent localized operation"
stop "$target"
exit 0
fi
fi
@ -535,6 +603,9 @@ case "${args[0]}" in
stop)
stop ${args[1]}
;;
forcestop)
stop ${args[1]} force
;;
restart)
restart ${args[1]}
;;
@ -542,7 +613,7 @@ case "${args[0]}" in
status ${args[1]}
;;
*)
echo "Usage: $0 {start|stop|restart|status} [{mon|osd|osd.<number>|mon.<hostname>}]"
echo "Usage: $0 {start|stop|forcestop|restart|status} [{mon|osd|osd.<number>|mon.<hostname>}]"
exit 1
;;
esac

View File

@ -1,6 +1,6 @@
[process]
process = ceph-mds
script = /etc/init.d/ceph
script = /etc/init.d/ceph-init-wrapper
style = lsb
severity = major ; minor, major, critical

View File

@ -0,0 +1,143 @@
#!/bin/bash
#
# Copyright (c) 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script monitors the Ceph network for carrier on an AIO-DX system.
# To prevent data corruption, when there is no carrier from the Ceph network,
# the floating monitor, the osds and the mds processes will be stopped.
# Platform settings. This script reads system_type, system_mode, ceph_network,
# management_interface and cluster_host_interface without assigning them, so
# they are presumably defined by this file - TODO confirm.
source /etc/platform/platform.conf
# Flag file: status() treats its absence as "Ceph is not running on this node"
CEPH_FILE="/var/run/.ceph_started"
# Script used by stop() to force-stop each Ceph service
CEPH_SCRIPT="/etc/init.d/ceph-init-wrapper"
# Presumably provides wlog, which log() calls but this script does not define
source /usr/lib/ceph/ceph_common.sh
# NOTE(review): the LOG_* values are not referenced again in this script;
# presumably they are consumed by the sourced logging helpers - confirm.
LOG_PATH=/var/log/ceph
LOG_FILE=$LOG_PATH/ceph-process-states.log
LOG_LEVEL=NORMAL # DEBUG
# Log Management
# Adding PID and PPID informations
# Usage: log [<name>] <log_level> <message...>
#   <name>      optional tag, logged as " (<name>)" after the caller info
#   <log_level> one of INFO, DEBUG, WARN, ERROR
# Always returns 0.
log () {
    local name=""
    local log_level="$1"

    # Checking if the first parameter is not a log level.
    # The match is exact, so a name that is merely a substring or a regex
    # of a level (e.g. "O" or ".") is still treated as a name. An empty
    # first parameter keeps its previous treatment as the level slot.
    case "${log_level}" in
        INFO|DEBUG|WARN|ERROR|"")
            ;;
        *)
            name=" ($1)"
            log_level="$2"
            shift
            ;;
    esac
    shift

    local message="$*"
    # prefix = <pid_subshell> <ppid_name>[<ppid>] <name|optional>
    local prefix="${BASHPID} $(cat /proc/${PPID}/comm)[${PPID}]${name}"
    # yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/ceph-storage-network <prefix> <log_level>: <message>
    wlog "${prefix}" "${log_level}" "${message}"
    return 0
}
# Map the configured Ceph network type to its platform interface.
# Reads 'ceph_network'; on a recognised value ("mgmt" or "cluster-host")
# stores the matching interface in 'ceph_network_interface' and returns 0.
# Returns 1, leaving 'ceph_network_interface' untouched, for anything else.
identify_ceph_network_interface() {
    local rc=0

    if [ "${ceph_network}" == "mgmt" ]; then
        ceph_network_interface="${management_interface}"
    elif [ "${ceph_network}" == "cluster-host" ]; then
        ceph_network_interface="${cluster_host_interface}"
    else
        rc=1
    fi

    return ${rc}
}
RETVAL=0
################################################################################
# Stop Ceph Services
################################################################################
# Force-stops the Ceph services one after another through ${CEPH_SCRIPT}.
# Globals:
#   system_type, system_mode (read) - select which monitor is stopped
#   CEPH_SCRIPT (read)              - script invoked with "forcestop <service>"
# Returns:
#   the exit status of the last forcestop invocation
stop()
{
    local services
    local service

    # This script should run only in AIO-DX called by sm. Double check it.
    if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; then
        # On AIO-DX only mon.controller is stopped, not every monitor
        services="osd mds mon.controller"
    else
        services="osd mds mon"
    fi

    # sequentially stopping ceph-osd, ceph-mds, then ceph-mon
    for service in ${services}; do
        ${CEPH_SCRIPT} forcestop "${service}"
    done
}
################################################################################
# Status Action
################################################################################
# Check the physical link of the interface that carries the Ceph network.
# Globals:
#   ceph_network_interface (written by identify_ceph_network_interface)
# Returns:
#   1 when "ip link show" reports NO-CARRIER for the interface,
#   0 when the link is up or the Ceph network cannot be determined
has_ceph_network_carrier()
{
    # Checks the carrier (cable connected) for Ceph network interface
    # If no-carrier is detected, then the interface has no physical link
    local interface=""

    # Resolve the interface through the helper: the platform variable names
    # do not follow a "<ceph_network>_interface" pattern ("mgmt" maps to
    # management_interface, "cluster-host" to cluster_host_interface).
    if identify_ceph_network_interface; then
        interface="${ceph_network_interface}"
    fi

    if [ -z "${interface}" ]; then
        log ERROR "Cannot detect Ceph network. Skipping network carrier detection"
        return 0
    fi

    if ip link show "${interface}" | grep NO-CARRIER; then
        log INFO "Ceph network '${interface}' has NO-CARRIER, cannot start ceph-mon"
        return 1
    fi
    return 0
}
# Check the Ceph network link and stop the Ceph services when it is lost.
# Globals:
#   CEPH_FILE (read)  - flag file; when missing, nothing is checked
#   RETVAL (written)  - set to 1 after the services were stopped
# Returns:
#   always 0; the outcome is reported through RETVAL
status()
{
    if [ ! -f ${CEPH_FILE} ]; then
        # Ceph is not running on this node, return success
        return 0
    fi

    # communication failure detected
    # stopping ceph services to avoid data corruption
    has_ceph_network_carrier || {
        stop
        RETVAL=1
    }

    return 0
}
################################################################################
# Main Entry
################################################################################
# Dispatch on the first argument. The script exits with RETVAL, which
# status() sets to 1 after it stopped the Ceph services on carrier loss.
case "$1" in
start)
# "start" performs the same carrier check as "status"
status
;;
stop)
# "stop" takes no action and only reports success
RETVAL=0
;;
status)
status
;;
*)
# Unknown or missing action: print usage and fail
echo "usage: $0 { start | stop | status }"
exit 1
;;
esac
exit $RETVAL

View File

@ -0,0 +1,110 @@
From e6f5d2cd267564ee97f53447ba1419d1ace641a1 Mon Sep 17 00:00:00 2001
From: Hediberto C Silva <hediberto.cavalcantedasilva@windriver.com>
Date: Tue, 19 Mar 2024 17:17:10 -0300
Subject: [PATCH] Add mon_data parameter
If supplied, the mon_data parameter defines where the ceph-mon data
will be located.
Signed-off-by: Hediberto C Silva <hediberto.cavalcantedasilva@windriver.com>
---
manifests/mon.pp | 33 ++++++++++++++++++++++++++-------
1 file changed, 26 insertions(+), 7 deletions(-)
diff --git a/manifests/mon.pp b/manifests/mon.pp
index 6d1294e..4615d3c 100644
--- a/manifests/mon.pp
+++ b/manifests/mon.pp
@@ -60,6 +60,7 @@
define ceph::mon (
$ensure = present,
$mon_enable = true,
+ $mon_data = '',
$public_addr = undef,
$cluster = undef,
$authentication_type = 'cephx',
@@ -139,7 +140,10 @@ chmod 0444 ${keyring_path}
",
unless => "/bin/true # comment to satisfy puppet syntax requirements
set -ex
-mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data) || exit 1
+mon_data=\"${mon_data}\"
+if [ -z \${mon_data} ]; then
+ mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data) || exit 1
+fi
# if ceph-mon fails then the mon is probably not configured yet
test -e \$mon_data/done
",
@@ -178,7 +182,10 @@ test -e /etc/ceph/${cluster_name}.client.admin.keyring",
-> exec { $ceph_mkfs:
command => "/bin/true # comment to satisfy puppet syntax requirements
set -ex
-mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+mon_data=\"${mon_data}\"
+if [ -z \${mon_data} ]; then
+ mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+fi
if [ ! -d \$mon_data ] ; then
mkdir -p \$mon_data
if getent passwd ceph >/dev/null 2>&1; then
@@ -188,7 +195,8 @@ if [ ! -d \$mon_data ] ; then
--mkfs \
--id ${id} \
--keyring ${keyring_path} \
- ${fsid_option} ; then
+ ${fsid_option} \
+ --mon-data \$mon_data ; then
touch \$mon_data/done \$mon_data/${init} \$mon_data/keyring
chown -h ceph:ceph \$mon_data/done \$mon_data/${init} \$mon_data/keyring
else
@@ -199,7 +207,8 @@ if [ ! -d \$mon_data ] ; then
--mkfs \
--id ${id} \
--keyring ${keyring_path} \
- ${fsid_option} ; then
+ ${fsid_option} \
+ --mon-data \$mon_data ; then
touch \$mon_data/done \$mon_data/${init} \$mon_data/keyring
else
rm -fr \$mon_data
@@ -209,7 +218,10 @@ fi
",
unless => "/bin/true # comment to satisfy puppet syntax requirements
set -ex
-mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+mon_data=\"${mon_data}\"
+if [ -z \${mon_data} ]; then
+ mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+fi
test -d \$mon_data
",
logoutput => true,
@@ -244,13 +256,19 @@ test ! -e ${keyring_path}
-> exec { "remove-mon-${id}":
command => "/bin/true # comment to satisfy puppet syntax requirements
set -ex
-mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+mon_data=\"${mon_data}\"
+if [ -z \${mon_data} ]; then
+ mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+fi
rm -fr \$mon_data
",
unless => "/bin/true # comment to satisfy puppet syntax requirements
set -ex
which ceph-mon || exit 0 # if ceph-mon is not available we already uninstalled ceph and there is nothing to do
-mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+mon_data=\"${mon_data}\"
+if [ -z \${mon_data} ]; then
+ mon_data=\$(ceph-mon ${cluster_option} --id ${id} --show-config-value mon_data)
+fi
test ! -d \$mon_data
",
logoutput => true,
@@ -263,3 +281,4 @@ test ! -d \$mon_data
fail('Ensure on MON must be either present or absent')
}
}
+
--
2.34.1

View File

@ -13,3 +13,4 @@
0014-Adjust-puppet-ceph-dependency-requirements.patch
0015-Fix-the-unless-condition-of-ceph-osd-prepare.patch
0016-Add-ms_bind_ipv4-option-to-ceph-paremeters.patch
0017-Add-mon_data-parameter.patch