Ceph Logs Improvements
Thinking about better tracking the Ceph Process States logs, we have added some crucial information like PID, parental PID, and the name of the parental process. Also, some checking status INFO logs have been increased to DEBUG level. yyyy-MM-dd HH:mm:ss.SSSSSS <script_path> <prefix> <log_level>: <message> where, prefix = <pid_subshell> <ppid_name>[<ppid>] <name|optional> Test Plan: PASS: Checking the new logs pattern after using a new build. PASS: Testing different scenarios and trying to track related SM actions with logs in the ceph-process-states.log file. Closes-Bug: 2063123 Change-Id: I218ae2316218d9548cd1b43d645ddfb338fd1bce Signed-off-by: Hediberto Cavalcante da Silva <hediberto.cavalcantedasilva@windriver.com>
This commit is contained in:
parent
bd90e442ac
commit
15b6f23511
@ -104,13 +104,35 @@ STUCK_PEERING_RESTART_THRESH=180 # Restart OSD if stuck peering is longer
|
||||
|
||||
LOG_LEVEL=NORMAL # DEBUG
|
||||
|
||||
# Log Management
|
||||
# Adding PID and PPID informations
|
||||
log () {
|
||||
local name=""
|
||||
local log_level="$1"
|
||||
# Checking if the first parameter is not a log level
|
||||
if grep -q -v ${log_level} <<< "INFO DEBUG WARN ERROR"; then
|
||||
name=" ($1)";
|
||||
log_level="$2"
|
||||
shift
|
||||
fi
|
||||
|
||||
shift
|
||||
|
||||
local message="$@"
|
||||
# prefix = <pid_subshell> <ppid_name>[<ppid>] <name|optional>
|
||||
local prefix="${BASHPID} $(cat /proc/${PPID}/comm)[${PPID}]${name}"
|
||||
# yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/ceph <prefix> <log_level>: <message>
|
||||
wlog "${prefix}" "${log_level}" "${message}"
|
||||
return 0
|
||||
}
|
||||
|
||||
save_proc_state() {
|
||||
# Set the state of a process in the state machine and store it in a file
|
||||
local name=$1
|
||||
local state=$2
|
||||
if [ "$state" != "$ST_STARTUP" ] && [ "$state" != "$ST_OPER" ] && \
|
||||
[ "$state" != "$ST_HANGED" ] && [ "$state" != "$ST_STOPPED" ]; then
|
||||
wlog $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP" print_trace
|
||||
log $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP" print_trace
|
||||
state=$ST_STARTUP
|
||||
fi
|
||||
echo "$state" > ${DATA_PATH}/.${name}_state
|
||||
@ -125,7 +147,7 @@ load_proc_state() {
|
||||
fi
|
||||
if [ "$state" != "$ST_STARTUP" ] && [ "$state" != "$ST_OPER" ] && \
|
||||
[ "$state" != "$ST_HANGED" ] && [ "$state" != "$ST_STOPPED" ]; then
|
||||
wlog $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP" print_trace
|
||||
log $name "ERROR" "State $state is invalid, resetting to $ST_STARTUP" print_trace
|
||||
state=$ST_STARTUP
|
||||
save_proc_state $name $state
|
||||
fi
|
||||
@ -142,7 +164,7 @@ save_proc_startup_ok() {
|
||||
save_proc_state $name $ST_STARTUP
|
||||
|
||||
# Save the time when a process was started
|
||||
wlog $name INFO "Process $ST_STARTUP successfully, waiting for it to become $ST_OPER"
|
||||
log $name INFO "Process $ST_STARTUP successfully, waiting for it to become $ST_OPER"
|
||||
echo $(date +%s) > ${DATA_PATH}/.${name}_start_time
|
||||
}
|
||||
|
||||
@ -177,7 +199,7 @@ get_duration(){
|
||||
|
||||
# Check that we have a filename
|
||||
if [ ! -f $record ]; then
|
||||
wlog $name ERROR "Failed to compute duration, time was never stored in $record!" print_trace
|
||||
log $name ERROR "Failed to compute duration, time was never stored in $record!" print_trace
|
||||
echo "-1"; return
|
||||
fi
|
||||
|
||||
@ -185,7 +207,7 @@ get_duration(){
|
||||
local start_time=$(cat $record)
|
||||
re="^[0-9]+$"
|
||||
if ! [[ "$start_time" =~ $re ]] ; then
|
||||
wlog $name ERROR "Recorded time '$start_time' is not a number!" print_trace
|
||||
log $name ERROR "Recorded time '$start_time' is not a number!" print_trace
|
||||
echo "-1"; return
|
||||
fi
|
||||
|
||||
@ -193,7 +215,7 @@ get_duration(){
|
||||
local now=$(date +%s)
|
||||
local duration=$((now-start_time))
|
||||
if [ "$duration" -lt 0 ]; then
|
||||
wlog $name ERROR "Duration less than 0!" print_trace
|
||||
log $name ERROR "Duration less than 0!" print_trace
|
||||
echo "-1"; return
|
||||
fi
|
||||
echo $duration
|
||||
@ -202,14 +224,14 @@ get_duration(){
|
||||
get_proc_run_time() {
|
||||
local name=$1
|
||||
local time=$(get_duration $name ${DATA_PATH}/.${name}_start_time)
|
||||
wlog $name DEBUG ">>> process running for: ${time}s"
|
||||
log $name DEBUG ">>> process running for: ${time}s"
|
||||
echo $time
|
||||
}
|
||||
|
||||
get_proc_down_time() {
|
||||
local name=$1
|
||||
local time=$(get_duration $name ${DATA_PATH}/.${name}_down_time)
|
||||
wlog $name DEBUG ">>> process down for: ${time}s"
|
||||
log $name DEBUG ">>> process down for: ${time}s"
|
||||
echo $time
|
||||
}
|
||||
|
||||
@ -224,19 +246,19 @@ run_state_machine() {
|
||||
|
||||
local state=$(load_proc_state $name)
|
||||
|
||||
wlog $name "DEBUG" ">>> state: $state"
|
||||
log $name "DEBUG" ">>> state: $state"
|
||||
# state machine
|
||||
if [ "$state" = "$ST_STARTUP" ]; then
|
||||
wlog $name "DEBUG" ">>> status: $status"
|
||||
log $name "DEBUG" ">>> status: $status"
|
||||
if [ "$status" = $UP ]; then
|
||||
save_proc_state $name $ST_OPER
|
||||
wlog $name "INFO" "Process is OPERATIONAL"
|
||||
log $name "INFO" "Process is OPERATIONAL"
|
||||
echo $ST_OPER; return
|
||||
else
|
||||
# the process should be 'up' in $WAIT_FOR_OSD_OPERATIONAL seconds!
|
||||
if [ $(get_proc_run_time $name) -gt $wait_for_operational ]; then
|
||||
# process hung!
|
||||
wlog $name "ERROR" "Process failed to go up in ${wait_for_operational}s after start, reporting it as $ST_HANGED!"
|
||||
log $name "ERROR" "Process failed to go up in ${wait_for_operational}s after start, reporting it as $ST_HANGED!"
|
||||
save_proc_state $name $ST_HANGED
|
||||
echo $ST_HANGED; return
|
||||
fi
|
||||
@ -244,20 +266,20 @@ run_state_machine() {
|
||||
elif [ "$state" = "$ST_OPER" ]; then
|
||||
if [ "$status" = "$DOWN" ]; then
|
||||
if [ $(load_proc_status $name) = $UP ];then
|
||||
wlog $name "WARN" "Process went down!"
|
||||
log $name "WARN" "Process went down!"
|
||||
save_proc_status $name $DOWN
|
||||
fi
|
||||
# if a process is down we don't report it as hung for a while
|
||||
# this should avoid status flapping
|
||||
if [ $(get_proc_down_time $name) -gt $wait_for_down_confirm ]; then
|
||||
# the process is down for a long time, report it as hung!
|
||||
wlog $name "ERROR" "Process went down for more than ${wait_for_down_confirm}s, reporting it as $ST_HANGED"
|
||||
log $name "ERROR" "Process went down for more than ${wait_for_down_confirm}s, reporting it as $ST_HANGED"
|
||||
save_proc_state $name $ST_HANGED
|
||||
echo $ST_HANGED; return
|
||||
fi
|
||||
elif [ "$status" = "$UP" ]; then
|
||||
if [ $(load_proc_status $name) = $DOWN ]; then
|
||||
wlog $name "WARN" "Process went up, flapping status or busy process?"
|
||||
log $name "WARN" "Process went up, flapping status or busy process?"
|
||||
save_proc_status $name $UP
|
||||
return
|
||||
fi
|
||||
@ -279,13 +301,13 @@ execute_ceph_cmd() {
|
||||
errcode=$?
|
||||
set +o pipefail
|
||||
if [ -z "$output" ] && [ $errcode -eq 124 ]; then # 'timeout' returns 124 when timing out
|
||||
wlog $name "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd"
|
||||
log $name "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd"
|
||||
CEPH_FAILURE="true"
|
||||
echo ""; return 1
|
||||
fi
|
||||
output=$(cat $DATA_PATH/.ceph_cmd_out)
|
||||
if [ -z "$output" ] || [ $errcode -ne 0 ]; then
|
||||
wlog $name "WARN" "Error executing: $cmd errorcode: $errcode output: $output"
|
||||
log $name "WARN" "Error executing: $cmd errorcode: $errcode output: $output"
|
||||
echo ""; return 1
|
||||
fi
|
||||
echo "$output"; return $errcode
|
||||
@ -299,7 +321,7 @@ is_process_hung() {
|
||||
|
||||
# Abort if we had previous errors with Ceph
|
||||
if [ "$CEPH_FAILURE" = "true" ]; then
|
||||
wlog $name "WARN" "Ceph cluster is marked as failed, aborting hang check"
|
||||
log $name "WARN" "Ceph cluster is marked as failed, aborting hang check"
|
||||
echo "false"; return
|
||||
fi
|
||||
|
||||
@ -307,7 +329,7 @@ is_process_hung() {
|
||||
if [ -z "$CEPH_HEALTH_DETAIL" ]; then
|
||||
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
||||
if [ $? -ne 0 ]; then
|
||||
wlog $name "WARN" "Aborting hang check"
|
||||
log $name "WARN" "Aborting hang check"
|
||||
echo "false"; return
|
||||
fi
|
||||
fi
|
||||
@ -318,7 +340,7 @@ is_process_hung() {
|
||||
# Note this can be done with: 'ceph osd set noup; ceph osd down <osd.id>'
|
||||
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
|
||||
if [ $? -eq 0 ]; then
|
||||
wlog $name "WARN" "Ceph 'noup' flag is set, aborting hang check"
|
||||
log $name "WARN" "Ceph 'noup' flag is set, aborting hang check"
|
||||
echo "false"; return
|
||||
fi
|
||||
|
||||
@ -327,7 +349,7 @@ is_process_hung() {
|
||||
if [ -z "$CEPH_OSD_TREE" ]; then
|
||||
execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
|
||||
if [ $? -ne 0 ]; then
|
||||
wlog $name "WARN" "Ceph cmd exec failed, aborting hang check"
|
||||
log $name "WARN" "Ceph cmd exec failed, aborting hang check"
|
||||
echo "false"; return
|
||||
fi
|
||||
fi
|
||||
@ -349,8 +371,8 @@ is_process_hung() {
|
||||
fi
|
||||
|
||||
|
||||
# Check if a Monitor is hung
|
||||
elif [ "$type" = "mon" ]; then
|
||||
# Check if a Monitor is hung
|
||||
elif [ "$type" = "mon" ]; then
|
||||
# Get monitor status info
|
||||
local mon_status=$UP
|
||||
echo "$CEPH_HEALTH_DETAIL" | grep -q -e "^[[:space:]]*$name.*down"
|
||||
@ -366,10 +388,13 @@ is_process_hung() {
|
||||
echo "false"; return
|
||||
fi
|
||||
|
||||
else
|
||||
wlog $name "WARN" "Unknown process type: $type"
|
||||
fi
|
||||
echo "false"
|
||||
elif [ "$type" = "mds" ]; then
|
||||
echo "false"; return
|
||||
|
||||
else
|
||||
log $name "WARN" "Unknown process type: $type"
|
||||
fi
|
||||
echo "false"
|
||||
}
|
||||
|
||||
osd_has_blocked_ops() {
|
||||
@ -377,7 +402,7 @@ osd_has_blocked_ops() {
|
||||
|
||||
# Abort if we had previous errors with Ceph
|
||||
if [ "$CEPH_FAILURE" = "true" ]; then
|
||||
wlog $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check"
|
||||
log $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check"
|
||||
echo "false"; return
|
||||
fi
|
||||
|
||||
@ -387,7 +412,7 @@ osd_has_blocked_ops() {
|
||||
if [ -z "$CEPH_HEALTH_DETAIL" ]; then
|
||||
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
||||
if [ $? -ne 0 ]; then
|
||||
wlog $name "WARN" "Aborting blocked ops check"
|
||||
log $name "WARN" "Aborting blocked ops check"
|
||||
echo "false"; return
|
||||
fi
|
||||
fi
|
||||
@ -396,7 +421,7 @@ osd_has_blocked_ops() {
|
||||
# Note this can be done with: 'ceph osd set noup; ceph osd down <osd.id>'
|
||||
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
|
||||
if [ $? -eq 0 ]; then
|
||||
wlog $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check"
|
||||
log $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check"
|
||||
echo "false"; return
|
||||
fi
|
||||
|
||||
@ -406,7 +431,7 @@ osd_has_blocked_ops() {
|
||||
if [ -z "$CEPH_OSD_TREE" ]; then
|
||||
execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
|
||||
if [ $? -ne 0 ]; then
|
||||
wlog $name "WARN" "Ceph cmd exec failed, aborting blocked ops check"
|
||||
log $name "WARN" "Ceph cmd exec failed, aborting blocked ops check"
|
||||
echo "false"; return
|
||||
fi
|
||||
fi
|
||||
@ -418,7 +443,7 @@ osd_has_blocked_ops() {
|
||||
local blocked_time=$(echo "$CEPH_HEALTH_DETAIL" | grep $name | sed -rn 's/.*ops are blocked > ([[:digit:]]*).*/\1/p')
|
||||
[[ "$blocked_time" == "" ]] && blocked_time=0
|
||||
if [ $blocked_time -gt $BLOCKED_OPS_RESTART_THRESH ]; then
|
||||
wlog $name "WARN" "Detected blocked operations for $blocked_time seconds"
|
||||
log $name "WARN" "Detected blocked operations for $blocked_time seconds"
|
||||
echo "true"; return
|
||||
else
|
||||
echo "false"; return
|
||||
@ -432,7 +457,7 @@ osd_has_stuck_peering() {
|
||||
|
||||
# Abort if we had previous errors with Ceph
|
||||
if [ "$CEPH_FAILURE" = "true" ]; then
|
||||
wlog $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check"
|
||||
log $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check"
|
||||
echo "false"; return
|
||||
fi
|
||||
|
||||
@ -442,7 +467,7 @@ osd_has_stuck_peering() {
|
||||
if [ -z "$CEPH_HEALTH_DETAIL" ]; then
|
||||
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
||||
if [ $? -ne 0 ]; then
|
||||
wlog $name "WARN" "Aborting stuck peering check"
|
||||
log $name "WARN" "Aborting stuck peering check"
|
||||
echo "false"; return
|
||||
fi
|
||||
fi
|
||||
@ -451,7 +476,7 @@ osd_has_stuck_peering() {
|
||||
# Note this can be done with: 'ceph osd set nodown; /etc/init.d/ceph stop <osd.id>'
|
||||
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "nodown.*set")
|
||||
if [ $? -eq 0 ]; then
|
||||
wlog $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check"
|
||||
log $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check"
|
||||
echo "false"; return
|
||||
fi
|
||||
|
||||
@ -487,7 +512,7 @@ osd_has_stuck_peering() {
|
||||
fi
|
||||
|
||||
if [ $blocked_time -gt $STUCK_PEERING_RESTART_THRESH ]; then
|
||||
wlog $name "WARN" "Detected stuck peering for $blocked_time seconds"
|
||||
log $name "WARN" "Detected stuck peering for $blocked_time seconds"
|
||||
echo "true"; return
|
||||
else
|
||||
echo "false"; return
|
||||
@ -613,7 +638,7 @@ stop_daemon() {
|
||||
## command line options
|
||||
options=
|
||||
IFS=" " read -r -a args <<< "$@"
|
||||
wlog "-" INFO "$@"
|
||||
log INFO "$@"
|
||||
|
||||
OPTS=$(${GETOPT} -n 'init-ceph' -o 'hvam:c:' -l 'help,verbose,valgrind,novalgrind,allhosts,restart,norestart,btrfs,nobtrfs,fsmount,nofsmount,btrfsumount,fsumount,conf:,cluster:,hostname:' -- "${args[@]}")
|
||||
if [ $? != 0 ]
|
||||
@ -883,7 +908,7 @@ for name in $what; do
|
||||
while [[ $(date '+%s') -lt "${deadline}" ]]; do
|
||||
systemd_coredump_pid=$(pgrep -f "systemd-coredump.*ceph-${type}")
|
||||
[[ -z "${systemd_coredump_pid}" ]] && break
|
||||
wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
|
||||
log $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
|
||||
sleep 1
|
||||
done
|
||||
|
||||
@ -1034,16 +1059,16 @@ EOF
|
||||
get_conf post_stop "" "post stop command"
|
||||
[ -n "$pre_stop" ] && do_cmd "$pre_stop"
|
||||
|
||||
wlog $name "INFO" "Stopping process"
|
||||
log $name "INFO" "Stopping process"
|
||||
|
||||
if [ $(load_proc_state $name) != "$ST_HANGED" ]; then
|
||||
stop_daemon $name ceph-$type $pid_file
|
||||
else
|
||||
# first try to gracefully close process, this should be fast if
|
||||
# its threads still respond to the TERM signal
|
||||
wlog $name "DEBUG" ">>> Sending term signal"
|
||||
log $name "DEBUG" ">>> Sending term signal"
|
||||
stop_daemon $name ceph-$type $pid_file TERM "" 5
|
||||
wlog $name "DEBUG" ">>> Sending kill signal"
|
||||
log $name "DEBUG" ">>> Sending kill signal"
|
||||
# then just kill it
|
||||
stop_daemon $name ceph-$type $pid_file KILL
|
||||
fi
|
||||
@ -1057,13 +1082,13 @@ EOF
|
||||
CMD_OUTPUT=''
|
||||
execute_ceph_cmd CMD_OUTPUT "Ceph Status" "ceph -s"
|
||||
if [ $? == 0 ]; then
|
||||
wlog "${name}" "INFO" "Flushing journal"
|
||||
log "${name}" "INFO" "Flushing journal"
|
||||
$(/usr/bin/ceph-osd -i $id --flush-journal) &
|
||||
else
|
||||
wlog "${name}" "INFO" "Skipping journal flush"
|
||||
log "${name}" "INFO" "Skipping journal flush"
|
||||
fi
|
||||
fi
|
||||
wlog $name "INFO" "Process stopped, setting state to $ST_STOPPED"
|
||||
log $name "INFO" "Process stopped, setting state to $ST_STOPPED"
|
||||
save_proc_state $name $ST_STOPPED
|
||||
if [ $dofsumount -eq 1 ] && [ -n "$fs_devs" ]; then
|
||||
echo Unmounting OSD volume on $host:$fs_path
|
||||
@ -1084,7 +1109,7 @@ EOF
|
||||
if [ $? -eq 0 ]; then
|
||||
state=$(echo "$CEPH_DAEMON_STATUS" | awk -F ':' '/state/{gsub(/[,[:space:]]/, "", $2);print $2}')
|
||||
if [ "$state" != "active" ]; then
|
||||
wlog $name "INFO" "$name has the following state: $state"
|
||||
log $name "INFO" "$name has the following state: $state"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
@ -86,16 +86,38 @@ if [ ! -z $ARGS ]; then
|
||||
args+=("${new_args[@]}")
|
||||
fi
|
||||
|
||||
# Log Management
|
||||
# Adding PID and PPID informations
|
||||
log () {
|
||||
local name=""
|
||||
local log_level="$1"
|
||||
# Checking if the first parameter is not a log level
|
||||
if grep -q -v ${log_level} <<< "INFO DEBUG WARN ERROR"; then
|
||||
name=" ($1)";
|
||||
log_level="$2"
|
||||
shift
|
||||
fi
|
||||
|
||||
shift
|
||||
|
||||
local message="$@"
|
||||
# prefix = <pid_subshell> <ppid_name>[<ppid>] <name|optional>
|
||||
local prefix="${BASHPID} $(cat /proc/${PPID}/comm)[${PPID}]${name}"
|
||||
# yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/ceph-init-wrapper <prefix> <log_level>: <message>
|
||||
wlog "${prefix}" "${log_level}" "${message}"
|
||||
return 0
|
||||
}
|
||||
|
||||
# Verify if drbd-cephmon role is primary, checking the output of 'drbdadm role'
|
||||
# Return 0 on success and 1 if drbd-cephmon is not primary
|
||||
is_drbd_cephmon_primary ()
|
||||
{
|
||||
drbdadm role drbd-cephmon | grep -q 'Primary/'
|
||||
if [ $? -eq 0 ]; then
|
||||
wlog "-" INFO "drbd-cephmon role is Primary"
|
||||
log INFO "drbd-cephmon role is Primary"
|
||||
return 0
|
||||
fi
|
||||
wlog "-" INFO "drbd-cephmon role is NOT Primary"
|
||||
log INFO "drbd-cephmon role is NOT Primary"
|
||||
return 1
|
||||
}
|
||||
|
||||
@ -105,10 +127,10 @@ is_drbd_cephmon_mounted ()
|
||||
{
|
||||
findmnt -no SOURCE "${CEPH_MON_LIB_PATH}" | grep -q drbd
|
||||
if [ $? -eq 0 ]; then
|
||||
wlog "-" INFO "drbd-cephmon partition is mounted"
|
||||
log INFO "drbd-cephmon partition is mounted"
|
||||
return 0
|
||||
fi
|
||||
wlog "-" INFO "drbd-cephmon partition is NOT mounted"
|
||||
log INFO "drbd-cephmon partition is NOT mounted"
|
||||
return 1
|
||||
}
|
||||
|
||||
@ -129,7 +151,7 @@ has_all_network_no_carrier()
|
||||
|
||||
# Check if all networks have no carrier, meaning the other host is down
|
||||
if [ "${oam_carrier}" -eq 0 ] && [ "${cluster_host_carrier}" -eq 0 ] && [ "${mgmt_carrier}" -eq 0 ]; then
|
||||
wlog "-" INFO "No carrier detected from all network interfaces"
|
||||
log INFO "No carrier detected from all network interfaces"
|
||||
return 0
|
||||
fi
|
||||
return 1
|
||||
@ -142,10 +164,10 @@ has_mgmt_network_carrier()
|
||||
# If no-carrier message is detected, then the interface has no physical link
|
||||
ip link show "${management_interface}" | grep NO-CARRIER
|
||||
if [ $? -eq 0 ]; then
|
||||
wlog "-" INFO "management interface '${management_interface}' has NO-CARRIER, cannot start ceph mon"
|
||||
log INFO "Management Interface '${management_interface}' has NO-CARRIER, cannot start ceph-mon"
|
||||
return 1
|
||||
fi
|
||||
wlog "-" INFO "management interface '${management_interface}' is working"
|
||||
log "-" DEBUG "Management Interface '${management_interface}' is working"
|
||||
return 0
|
||||
}
|
||||
|
||||
@ -168,7 +190,7 @@ can_start_ceph_mon ()
|
||||
done
|
||||
|
||||
if [ ${times} -eq 0 ]; then
|
||||
wlog "-" ERROR "drbd-cephmon is not primary, cannot start ceph mon"
|
||||
log ERROR "drbd-cephmon is NOT Primary, cannot start ceph-mon"
|
||||
return 1
|
||||
fi
|
||||
|
||||
@ -184,7 +206,7 @@ can_start_ceph_mon ()
|
||||
done
|
||||
|
||||
if [ ${times} -eq 0 ]; then
|
||||
wlog "-" ERROR "drbd-cephmon is not mounted, cannot start ceph mon"
|
||||
log ERROR "drbd-cephmon is NOT mounted, cannot start ceph-mon"
|
||||
return 1
|
||||
fi
|
||||
|
||||
@ -201,12 +223,12 @@ with_service_lock ()
|
||||
# used for locking service actions
|
||||
(
|
||||
# Grab service locks
|
||||
wlog "-" INFO "Grab service locks"
|
||||
log INFO "Grab service locks"
|
||||
[[ "${target}" == *"mon"* ]] && flock ${LOCK_CEPH_MON_SERVICE_FD}
|
||||
[[ "${target}" == *"osd"* ]] && flock ${LOCK_CEPH_OSD_SERVICE_FD}
|
||||
|
||||
# Try to lock status with a timeout in case status is stuck
|
||||
wlog "-" INFO "Lock service status"
|
||||
log INFO "Lock service status"
|
||||
deadline=$((SECONDS + MAX_STATUS_TIMEOUT + 1))
|
||||
if [[ "${target}" == *"mon"* ]]; then
|
||||
flock --exclusive --timeout ${MONITOR_STATUS_TIMEOUT} ${LOCK_CEPH_MON_STATUS_FD}
|
||||
@ -221,7 +243,7 @@ with_service_lock ()
|
||||
# Close lock file descriptors so they are
|
||||
# not inherited by the spawned process then
|
||||
# run service action
|
||||
wlog "-" INFO "Run service action: $@"
|
||||
log INFO "Run service action: $@"
|
||||
"$@" {LOCK_CEPH_MON_SERVICE_FD}>&- \
|
||||
{LOCK_CEPH_MON_STATUS_FD}>&- \
|
||||
{LOCK_CEPH_OSD_SERVICE_FD}>&- \
|
||||
@ -250,7 +272,7 @@ start ()
|
||||
if [ "${service}" == "mon" ]; then
|
||||
can_start_ceph_mon
|
||||
if [ $? -ne 0 ]; then
|
||||
wlog "-" INFO "Ceph monitor is not ready to start because drbd-cephmon is not ready and mounted"
|
||||
log INFO "Ceph Monitor is not ready to start because drbd-cephmon is not ready and mounted"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
@ -262,33 +284,33 @@ start ()
|
||||
if [ "${system_mode}" == "duplex-direct" ]; then
|
||||
has_all_network_no_carrier
|
||||
if [ $? -eq 0 ]; then
|
||||
wlog "-" INFO "All network interfaces are not functional, considering the other host is down. Let Ceph start."
|
||||
log INFO "All network interfaces are not functional, considering the other host is down. Let Ceph start."
|
||||
else
|
||||
# Else AIO-DX Direct mgmt network is NOT functional
|
||||
wlog "-" INFO "Mgmt network is not functional, defer starting Ceph processes until recovered"
|
||||
log INFO "Management Interface is not functional, defer starting Ceph processes until recovered"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
# Else AIO-DX mgmt network is NOT functional
|
||||
wlog "-" INFO "Mgmt network is not functional, defer starting Ceph processes until recovered"
|
||||
log INFO "Management Interface is not functional, defer starting Ceph processes until recovered"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
# Start the service
|
||||
wlog "-" INFO "Ceph START ${service} command received"
|
||||
log INFO "Ceph START ${service} command received"
|
||||
with_service_lock "${service}" ${CEPH_SCRIPT} start ${service}
|
||||
wlog "-" INFO "Ceph START ${service} command finished."
|
||||
log INFO "Ceph START ${service} command finished."
|
||||
}
|
||||
|
||||
stop ()
|
||||
{
|
||||
local service="$1"
|
||||
|
||||
wlog "-" INFO "Ceph STOP $1 command received."
|
||||
log INFO "Ceph STOP $1 command received."
|
||||
with_service_lock "$1" ${CEPH_SCRIPT} stop $1
|
||||
wlog "-" INFO "Ceph STOP $1 command finished."
|
||||
log INFO "Ceph STOP $1 command finished."
|
||||
}
|
||||
|
||||
restart ()
|
||||
@ -297,10 +319,10 @@ restart ()
|
||||
# Ceph is not running on this node, return success
|
||||
exit 0
|
||||
fi
|
||||
wlog "-" INFO "Ceph RESTART $1 command received."
|
||||
log INFO "Ceph RESTART $1 command received."
|
||||
stop "$1"
|
||||
start "$1"
|
||||
wlog "-" INFO "Ceph RESTART $1 command finished."
|
||||
log INFO "Ceph RESTART $1 command finished."
|
||||
}
|
||||
|
||||
log_and_restart_blocked_osds ()
|
||||
@ -309,7 +331,7 @@ log_and_restart_blocked_osds ()
|
||||
local names=$1
|
||||
local message=$2
|
||||
for name in $names; do
|
||||
wlog $name "INFO" "$message"
|
||||
log $name "INFO" "$message"
|
||||
# Restart the daemons but release ceph mon and osd file descriptors
|
||||
${CEPH_SCRIPT} restart $name {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-
|
||||
done
|
||||
@ -325,17 +347,17 @@ log_and_kill_hung_procs ()
|
||||
get_conf run_dir "/var/run/ceph" "run dir"
|
||||
get_conf pid_file "$run_dir/$type.$id.pid" "pid file"
|
||||
pid=$(cat $pid_file)
|
||||
wlog $name "INFO" "Dealing with hung process (pid:$pid)"
|
||||
log $name "INFO" "Dealing with hung process (pid:$pid)"
|
||||
|
||||
# monitoring interval
|
||||
wlog $name "INFO" "Increasing log level"
|
||||
log $name "INFO" "Increasing log level"
|
||||
WAIT_FOR_CMD=10 execute_ceph_cmd ret $name "ceph daemon $name config set debug_$type 20/20"
|
||||
monitoring=$MONITORING_INTERVAL
|
||||
while [ $monitoring -gt 0 ]; do
|
||||
if [ $(($monitoring % $TRACE_LOOP_INTERVAL)) -eq 0 ]; then
|
||||
date=$(date "+%Y-%m-%d_%H-%M-%S")
|
||||
log_file="$LOG_PATH/hang_trace_${name}_${pid}_${date}.log"
|
||||
wlog $name "INFO" "Dumping stack trace to: $log_file"
|
||||
log $name "INFO" "Dumping stack trace to: $log_file"
|
||||
if grep -q "Debian" /etc/os-release; then
|
||||
$(eu-stack -p $pid >$log_file) &
|
||||
elif grep -q "CentOS" /etc/os-release; then
|
||||
@ -345,7 +367,7 @@ log_and_kill_hung_procs ()
|
||||
let monitoring-=1
|
||||
sleep 1
|
||||
done
|
||||
wlog $name "INFO" "Trigger core dump"
|
||||
log $name "INFO" "Trigger core dump"
|
||||
kill -ABRT $pid &>/dev/null
|
||||
rm -f $pid_file # process is dead, core dump is archiving, preparing for restart
|
||||
# Wait for pending systemd core dumps
|
||||
@ -354,7 +376,7 @@ log_and_kill_hung_procs ()
|
||||
while [[ $(date '+%s') -lt "${deadline}" ]]; do
|
||||
systemd_coredump_pid=$(pgrep -f "systemd-coredump.*${pid}.*ceph-${type}")
|
||||
[[ -z "${systemd_coredump_pid}" ]] && break
|
||||
wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
|
||||
log $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
|
||||
sleep 2
|
||||
done
|
||||
kill -KILL $pid &>/dev/null
|
||||
@ -375,19 +397,19 @@ status ()
|
||||
has_mgmt_network_carrier
|
||||
if [ $? -eq 0 ]; then
|
||||
# Network is functional, continue
|
||||
wlog "-" INFO "mgmt network active..."
|
||||
log DEBUG "Management Interface active."
|
||||
else
|
||||
if [ "${system_mode}" == "duplex-direct" ]; then
|
||||
has_all_network_no_carrier
|
||||
if [ $? -ne 0 ]; then
|
||||
# Network is NOT functional, prevent split brain corruptions
|
||||
wlog "-" INFO "mgmt network inactive... stop OSDs to force a re-peering once the network has recovered"
|
||||
log INFO "Management Interface inactive. Stopping OSDs to force a re-peering once the network has recovered"
|
||||
stop "$1"
|
||||
exit 0
|
||||
fi
|
||||
else
|
||||
# Network is NOT functional, prevent split brain corruptions
|
||||
wlog "-" INFO "mgmt network inactive... stop OSDs to force a re-peering once the network has recovered"
|
||||
log INFO "Management Interface inactive. Stopping OSDs to force a re-peering once the network has recovered"
|
||||
stop "$1"
|
||||
exit 0
|
||||
fi
|
||||
@ -397,7 +419,7 @@ status ()
|
||||
if [ "$?" -ne 0 ]; then
|
||||
# Ceph cluster is not accessible. Don't panic, controller swact
|
||||
# may be in progress.
|
||||
wlog "-" INFO "Ceph is down, ignoring OSD status."
|
||||
log INFO "Ceph is DOWN, ignoring OSD status."
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
@ -425,7 +447,7 @@ status ()
|
||||
flock --shared ${LOCK_CEPH_OSD_STATUS_FD}
|
||||
fi
|
||||
|
||||
result=`${CEPH_SCRIPT} status $1 {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-`
|
||||
result=`log INFO "status $1"; ${CEPH_SCRIPT} status $1 {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-`
|
||||
RC=$?
|
||||
if [ "$RC" -ne 0 ]; then
|
||||
erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
|
||||
@ -488,13 +510,13 @@ status ()
|
||||
has_all_network_no_carrier
|
||||
if [ $? -ne 0 ]; then
|
||||
# Network is NOT functional, prevent split brain corruptions
|
||||
wlog "-" INFO "mgmt network inactive... stop MON to prevent localized operation"
|
||||
log INFO "Management Interface inactive. Stopping ceph-mon to prevent localized operation"
|
||||
stop "$1"
|
||||
exit 0
|
||||
fi
|
||||
else
|
||||
# Network is NOT functional, prevent split brain corruptions
|
||||
wlog "-" INFO "mgmt network inactive... stop MON to prevent localized operation"
|
||||
log INFO "Management Interface inactive. Stopping ceph-mon to prevent localized operation"
|
||||
stop "$1"
|
||||
exit 0
|
||||
fi
|
||||
|
Loading…
x
Reference in New Issue
Block a user