Merge "Fix ceph init script variable caching"
This commit is contained in:
commit
7ab2044d95
@ -290,29 +290,6 @@ run_state_machine() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
CEPH_FAILURE=""
|
|
||||||
execute_ceph_cmd() {
|
|
||||||
# execute a command and, if it times out, mark ceph as failed
|
|
||||||
local name=$1
|
|
||||||
local cmd=$2
|
|
||||||
local cmd="timeout $WAIT_FOR_CMD $cmd"
|
|
||||||
set -o pipefail
|
|
||||||
eval "$cmd >$DATA_PATH/.ceph_cmd_out"
|
|
||||||
errcode=$?
|
|
||||||
set +o pipefail
|
|
||||||
if [ -z "$output" ] && [ $errcode -eq 124 ]; then # 'timeout' returns 124 when timing out
|
|
||||||
log $name "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd"
|
|
||||||
CEPH_FAILURE="true"
|
|
||||||
echo ""; return 1
|
|
||||||
fi
|
|
||||||
output=$(cat $DATA_PATH/.ceph_cmd_out)
|
|
||||||
if [ -z "$output" ] || [ $errcode -ne 0 ]; then
|
|
||||||
log $name "WARN" "Error executing: $cmd errorcode: $errcode output: $output"
|
|
||||||
echo ""; return 1
|
|
||||||
fi
|
|
||||||
echo "$output"; return $errcode
|
|
||||||
}
|
|
||||||
|
|
||||||
CEPH_OSD_TREE=""
|
CEPH_OSD_TREE=""
|
||||||
CEPH_HEALTH_DETAIL=""
|
CEPH_HEALTH_DETAIL=""
|
||||||
is_process_hung() {
|
is_process_hung() {
|
||||||
@ -322,7 +299,7 @@ is_process_hung() {
|
|||||||
# Abort if we had previous errors with Ceph
|
# Abort if we had previous errors with Ceph
|
||||||
if [ "$CEPH_FAILURE" = "true" ]; then
|
if [ "$CEPH_FAILURE" = "true" ]; then
|
||||||
log $name "WARN" "Ceph cluster is marked as failed, aborting hang check"
|
log $name "WARN" "Ceph cluster is marked as failed, aborting hang check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Cache Ceph Health for later use as calling Ceph takes time
|
# Cache Ceph Health for later use as calling Ceph takes time
|
||||||
@ -330,7 +307,7 @@ is_process_hung() {
|
|||||||
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
log $name "WARN" "Aborting hang check"
|
log $name "WARN" "Aborting hang check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -341,7 +318,7 @@ is_process_hung() {
|
|||||||
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
|
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
|
||||||
if [ $? -eq 0 ]; then
|
if [ $? -eq 0 ]; then
|
||||||
log $name "WARN" "Ceph 'noup' flag is set, aborting hang check"
|
log $name "WARN" "Ceph 'noup' flag is set, aborting hang check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Multiple OSD processes may be running, so we only run
|
# Multiple OSD processes may be running, so we only run
|
||||||
@ -350,7 +327,7 @@ is_process_hung() {
|
|||||||
execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
|
execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
log $name "WARN" "Ceph cmd exec failed, aborting hang check"
|
log $name "WARN" "Ceph cmd exec failed, aborting hang check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -365,9 +342,9 @@ is_process_hung() {
|
|||||||
local state=$(run_state_machine $name $type $osd_status \
|
local state=$(run_state_machine $name $type $osd_status \
|
||||||
$WAIT_FOR_OSD_OPERATIONAL $WAIT_FOR_OSD_DOWN_CONFIRM)
|
$WAIT_FOR_OSD_OPERATIONAL $WAIT_FOR_OSD_DOWN_CONFIRM)
|
||||||
if [ "$state" = "$ST_HANGED" ]; then
|
if [ "$state" = "$ST_HANGED" ]; then
|
||||||
echo "true"; return
|
return 0
|
||||||
else
|
else
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
@ -383,18 +360,18 @@ is_process_hung() {
|
|||||||
local state=$(run_state_machine $name $type $mon_status \
|
local state=$(run_state_machine $name $type $mon_status \
|
||||||
$WAIT_FOR_MON_OPERATIONAL $WAIT_FOR_MON_DOWN_CONFIRM)
|
$WAIT_FOR_MON_OPERATIONAL $WAIT_FOR_MON_DOWN_CONFIRM)
|
||||||
if [ "$state" = "$ST_HANGED" ]; then
|
if [ "$state" = "$ST_HANGED" ]; then
|
||||||
echo "true"; return
|
return 0
|
||||||
else
|
else
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
elif [ "$type" = "mds" ]; then
|
elif [ "$type" = "mds" ]; then
|
||||||
echo "false"; return
|
return 1
|
||||||
|
|
||||||
else
|
else
|
||||||
log $name "WARN" "Unknown process type: $type"
|
log $name "WARN" "Unknown process type: $type"
|
||||||
fi
|
fi
|
||||||
echo "false"
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
osd_has_blocked_ops() {
|
osd_has_blocked_ops() {
|
||||||
@ -403,7 +380,7 @@ osd_has_blocked_ops() {
|
|||||||
# Abort if we had previous errors with Ceph
|
# Abort if we had previous errors with Ceph
|
||||||
if [ "$CEPH_FAILURE" = "true" ]; then
|
if [ "$CEPH_FAILURE" = "true" ]; then
|
||||||
log $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check"
|
log $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Cache Ceph Health for later use as calling Ceph takes time This is
|
# Cache Ceph Health for later use as calling Ceph takes time This is
|
||||||
@ -413,7 +390,7 @@ osd_has_blocked_ops() {
|
|||||||
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
log $name "WARN" "Aborting blocked ops check"
|
log $name "WARN" "Aborting blocked ops check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -422,7 +399,7 @@ osd_has_blocked_ops() {
|
|||||||
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
|
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
|
||||||
if [ $? -eq 0 ]; then
|
if [ $? -eq 0 ]; then
|
||||||
log $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check"
|
log $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Multiple OSD processes may be running, so we only run 'ceph osd tree' once
|
# Multiple OSD processes may be running, so we only run 'ceph osd tree' once
|
||||||
@ -432,7 +409,7 @@ osd_has_blocked_ops() {
|
|||||||
execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
|
execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
log $name "WARN" "Ceph cmd exec failed, aborting blocked ops check"
|
log $name "WARN" "Ceph cmd exec failed, aborting blocked ops check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -444,9 +421,9 @@ osd_has_blocked_ops() {
|
|||||||
[[ "$blocked_time" == "" ]] && blocked_time=0
|
[[ "$blocked_time" == "" ]] && blocked_time=0
|
||||||
if [ $blocked_time -gt $BLOCKED_OPS_RESTART_THRESH ]; then
|
if [ $blocked_time -gt $BLOCKED_OPS_RESTART_THRESH ]; then
|
||||||
log $name "WARN" "Detected blocked operations for $blocked_time seconds"
|
log $name "WARN" "Detected blocked operations for $blocked_time seconds"
|
||||||
echo "true"; return
|
return 0
|
||||||
else
|
else
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
@ -458,7 +435,7 @@ osd_has_stuck_peering() {
|
|||||||
# Abort if we had previous errors with Ceph
|
# Abort if we had previous errors with Ceph
|
||||||
if [ "$CEPH_FAILURE" = "true" ]; then
|
if [ "$CEPH_FAILURE" = "true" ]; then
|
||||||
log $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check"
|
log $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Cache Ceph Health for later use as calling Ceph takes time This is
|
# Cache Ceph Health for later use as calling Ceph takes time This is
|
||||||
@ -468,7 +445,7 @@ osd_has_stuck_peering() {
|
|||||||
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
log $name "WARN" "Aborting stuck peering check"
|
log $name "WARN" "Aborting stuck peering check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -477,7 +454,7 @@ osd_has_stuck_peering() {
|
|||||||
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "nodown.*set")
|
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "nodown.*set")
|
||||||
if [ $? -eq 0 ]; then
|
if [ $? -eq 0 ]; then
|
||||||
log $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check"
|
log $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check"
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
@ -513,9 +490,9 @@ osd_has_stuck_peering() {
|
|||||||
|
|
||||||
if [ $blocked_time -gt $STUCK_PEERING_RESTART_THRESH ]; then
|
if [ $blocked_time -gt $STUCK_PEERING_RESTART_THRESH ]; then
|
||||||
log $name "WARN" "Detected stuck peering for $blocked_time seconds"
|
log $name "WARN" "Detected stuck peering for $blocked_time seconds"
|
||||||
echo "true"; return
|
return 0
|
||||||
else
|
else
|
||||||
echo "false"; return
|
return 1
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
# register the time for first detected stuck peering
|
# register the time for first detected stuck peering
|
||||||
@ -524,8 +501,6 @@ osd_has_stuck_peering() {
|
|||||||
else
|
else
|
||||||
rm -f ${file} 2>/dev/null
|
rm -f ${file} 2>/dev/null
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
######################
|
######################
|
||||||
@ -602,7 +577,7 @@ daemon_is_running() {
|
|||||||
pidfile=$4
|
pidfile=$4
|
||||||
do_cmd "[ -e $pidfile ] || exit 1 # no pid, presumably not running
|
do_cmd "[ -e $pidfile ] || exit 1 # no pid, presumably not running
|
||||||
pid=\`cat $pidfile\`
|
pid=\`cat $pidfile\`
|
||||||
ps -p \$pid -o args= | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running
|
cat /proc/\$pid/cmdline | tr '\\0' ' ' | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running
|
||||||
exit 1 # pid is something else" "" "okfail"
|
exit 1 # pid is something else" "" "okfail"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1119,9 +1094,6 @@ EOF
|
|||||||
status)
|
status)
|
||||||
if daemon_is_running $name ceph-$type $id $pid_file; then
|
if daemon_is_running $name ceph-$type $id $pid_file; then
|
||||||
|
|
||||||
# ceph processes answer in around 100ms when the process works correctly
|
|
||||||
do_cmd "timeout 1 $BINDIR/ceph --admin-daemon $asok version 2>/dev/null || echo unknown"
|
|
||||||
|
|
||||||
# log ceph osd state
|
# log ceph osd state
|
||||||
if [ "$type" = "osd" ];then
|
if [ "$type" = "osd" ];then
|
||||||
CEPH_DAEMON_STATUS=""
|
CEPH_DAEMON_STATUS=""
|
||||||
@ -1135,8 +1107,8 @@ EOF
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# check if daemon is hung
|
# check if daemon is hung
|
||||||
is_hung=$(is_process_hung $name $type)
|
is_process_hung $name $type
|
||||||
if [ "$is_hung" = "true" ]; then
|
if [ $? -eq 0 ]; then
|
||||||
echo "$name: hung."
|
echo "$name: hung."
|
||||||
# based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
|
# based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
|
||||||
# exit codes from 150 to 199 are application specific, therefore we define one here
|
# exit codes from 150 to 199 are application specific, therefore we define one here
|
||||||
@ -1146,8 +1118,8 @@ EOF
|
|||||||
if [ "$type" = "osd" ] && [ $BLOCKED_OPS_DETECTION_ENABLED = "true" ]; then
|
if [ "$type" = "osd" ] && [ $BLOCKED_OPS_DETECTION_ENABLED = "true" ]; then
|
||||||
up_time=$(get_proc_run_time $name)
|
up_time=$(get_proc_run_time $name)
|
||||||
if [ $up_time -gt $BLOCKED_OPS_START_DETECTION ]; then
|
if [ $up_time -gt $BLOCKED_OPS_START_DETECTION ]; then
|
||||||
has_blocked_ops=$(osd_has_blocked_ops $name)
|
osd_has_blocked_ops $name
|
||||||
if [ "$has_blocked_ops" = "true" ]; then
|
if [ $? -eq 0 ]; then
|
||||||
echo "$name: blocked ops."
|
echo "$name: blocked ops."
|
||||||
# based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
|
# based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
|
||||||
# exit codes from 150 to 199 are application specific, therefore we define one here
|
# exit codes from 150 to 199 are application specific, therefore we define one here
|
||||||
|
Loading…
x
Reference in New Issue
Block a user