Fix ceph init script variable caching
The status function of the Ceph init script is meant to cache the output of the costly Ceph CLI commands: `ceph status`, `ceph osd tree` and `ceph health detail`. The variable caching was not working because the Ceph CLI commands were being called in subshells, which do not share variables with the parent shell, so the caches were always empty. The script now calls functions directly instead of using subshells, so the caching works properly and the execution time of the status commands is reduced.

Other changes included:
- removal of the unused execute_ceph_cmd(), which was always being overridden by (and used from) ceph_common.sh
- removal of the ceph version call

Test-Plan:
PASS: AIO-SX/AIO-DX/Standard/Storage: Check that the ceph init script works without any errors when calling the start/stop/status commands.

Partial-bug: 2077673
Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com>
Change-Id: I4cf26fec24d4b1cef86a928f044fb1818caa6705
This commit is contained in:
parent
bcfb26840b
commit
7f588d1f7c
@ -290,29 +290,6 @@ run_state_machine() {
|
||||
fi
|
||||
}
|
||||
|
||||
CEPH_FAILURE=""
|
||||
execute_ceph_cmd() {
|
||||
# execute a comand and in case it timeouts mark ceph as failed
|
||||
local name=$1
|
||||
local cmd=$2
|
||||
local cmd="timeout $WAIT_FOR_CMD $cmd"
|
||||
set -o pipefail
|
||||
eval "$cmd >$DATA_PATH/.ceph_cmd_out"
|
||||
errcode=$?
|
||||
set +o pipefail
|
||||
if [ -z "$output" ] && [ $errcode -eq 124 ]; then # 'timeout' returns 124 when timing out
|
||||
log $name "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd"
|
||||
CEPH_FAILURE="true"
|
||||
echo ""; return 1
|
||||
fi
|
||||
output=$(cat $DATA_PATH/.ceph_cmd_out)
|
||||
if [ -z "$output" ] || [ $errcode -ne 0 ]; then
|
||||
log $name "WARN" "Error executing: $cmd errorcode: $errcode output: $output"
|
||||
echo ""; return 1
|
||||
fi
|
||||
echo "$output"; return $errcode
|
||||
}
|
||||
|
||||
CEPH_OSD_TREE=""
|
||||
CEPH_HEALTH_DETAIL=""
|
||||
is_process_hung() {
|
||||
@ -322,7 +299,7 @@ is_process_hung() {
|
||||
# Abort if we had previous errors with Ceph
|
||||
if [ "$CEPH_FAILURE" = "true" ]; then
|
||||
log $name "WARN" "Ceph cluster is marked as failed, aborting hang check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Cache Ceph Health for later use as calling Ceph takes time
|
||||
@ -330,7 +307,7 @@ is_process_hung() {
|
||||
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
||||
if [ $? -ne 0 ]; then
|
||||
log $name "WARN" "Aborting hang check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -341,7 +318,7 @@ is_process_hung() {
|
||||
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
|
||||
if [ $? -eq 0 ]; then
|
||||
log $name "WARN" "Ceph 'noup' flag is set, aborting hang check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Multiple OSD processes may be running, so we only run
|
||||
@ -350,7 +327,7 @@ is_process_hung() {
|
||||
execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
|
||||
if [ $? -ne 0 ]; then
|
||||
log $name "WARN" "Ceph cmd exec failed, aborting hang check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -365,9 +342,9 @@ is_process_hung() {
|
||||
local state=$(run_state_machine $name $type $osd_status \
|
||||
$WAIT_FOR_OSD_OPERATIONAL $WAIT_FOR_OSD_DOWN_CONFIRM)
|
||||
if [ "$state" = "$ST_HANGED" ]; then
|
||||
echo "true"; return
|
||||
return 0
|
||||
else
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
|
||||
|
||||
@ -383,18 +360,18 @@ is_process_hung() {
|
||||
local state=$(run_state_machine $name $type $mon_status \
|
||||
$WAIT_FOR_MON_OPERATIONAL $WAIT_FOR_MON_DOWN_CONFIRM)
|
||||
if [ "$state" = "$ST_HANGED" ]; then
|
||||
echo "true"; return
|
||||
return 0
|
||||
else
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
|
||||
elif [ "$type" = "mds" ]; then
|
||||
echo "false"; return
|
||||
return 1
|
||||
|
||||
else
|
||||
log $name "WARN" "Unknown process type: $type"
|
||||
fi
|
||||
echo "false"
|
||||
return 1
|
||||
}
|
||||
|
||||
osd_has_blocked_ops() {
|
||||
@ -403,7 +380,7 @@ osd_has_blocked_ops() {
|
||||
# Abort if we had previous errors with Ceph
|
||||
if [ "$CEPH_FAILURE" = "true" ]; then
|
||||
log $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Cache Ceph Health for later use as calling Ceph takes time This is
|
||||
@ -413,7 +390,7 @@ osd_has_blocked_ops() {
|
||||
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
||||
if [ $? -ne 0 ]; then
|
||||
log $name "WARN" "Aborting blocked ops check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -422,7 +399,7 @@ osd_has_blocked_ops() {
|
||||
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
|
||||
if [ $? -eq 0 ]; then
|
||||
log $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Multiple OSD processes may be running, so we only run 'ceph osd tree' once
|
||||
@ -432,7 +409,7 @@ osd_has_blocked_ops() {
|
||||
execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
|
||||
if [ $? -ne 0 ]; then
|
||||
log $name "WARN" "Ceph cmd exec failed, aborting blocked ops check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -444,9 +421,9 @@ osd_has_blocked_ops() {
|
||||
[[ "$blocked_time" == "" ]] && blocked_time=0
|
||||
if [ $blocked_time -gt $BLOCKED_OPS_RESTART_THRESH ]; then
|
||||
log $name "WARN" "Detected blocked operations for $blocked_time seconds"
|
||||
echo "true"; return
|
||||
return 0
|
||||
else
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
}
|
||||
@ -458,7 +435,7 @@ osd_has_stuck_peering() {
|
||||
# Abort if we had previous errors with Ceph
|
||||
if [ "$CEPH_FAILURE" = "true" ]; then
|
||||
log $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Cache Ceph Health for later use as calling Ceph takes time This is
|
||||
@ -468,7 +445,7 @@ osd_has_stuck_peering() {
|
||||
execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
|
||||
if [ $? -ne 0 ]; then
|
||||
log $name "WARN" "Aborting stuck peering check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -477,7 +454,7 @@ osd_has_stuck_peering() {
|
||||
$(echo "$CEPH_HEALTH_DETAIL" | grep -q "nodown.*set")
|
||||
if [ $? -eq 0 ]; then
|
||||
log $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check"
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
|
||||
|
||||
@ -513,9 +490,9 @@ osd_has_stuck_peering() {
|
||||
|
||||
if [ $blocked_time -gt $STUCK_PEERING_RESTART_THRESH ]; then
|
||||
log $name "WARN" "Detected stuck peering for $blocked_time seconds"
|
||||
echo "true"; return
|
||||
return 0
|
||||
else
|
||||
echo "false"; return
|
||||
return 1
|
||||
fi
|
||||
else
|
||||
# register the time for first detected stuck peering
|
||||
@ -524,8 +501,6 @@ osd_has_stuck_peering() {
|
||||
else
|
||||
rm -f ${file} 2>/dev/null
|
||||
fi
|
||||
|
||||
|
||||
}
|
||||
|
||||
######################
|
||||
@ -601,8 +576,8 @@ daemon_is_running() {
|
||||
daemon_id=$3
|
||||
pidfile=$4
|
||||
do_cmd "[ -e $pidfile ] || exit 1 # no pid, presumably not running
|
||||
pid=\`cat $pidfile\`
|
||||
ps -p \$pid -o args= | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running
|
||||
pid=\`cat $pidfile\`
|
||||
cat /proc/\$pid/cmdline | tr '\\0' ' ' | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running
|
||||
exit 1 # pid is something else" "" "okfail"
|
||||
}
|
||||
|
||||
@ -1119,9 +1094,6 @@ EOF
|
||||
status)
|
||||
if daemon_is_running $name ceph-$type $id $pid_file; then
|
||||
|
||||
# ceph processes answer in around 100ms when the process works correctly
|
||||
do_cmd "timeout 1 $BINDIR/ceph --admin-daemon $asok version 2>/dev/null || echo unknown"
|
||||
|
||||
# log ceph osd state
|
||||
if [ "$type" = "osd" ];then
|
||||
CEPH_DAEMON_STATUS=""
|
||||
@ -1135,8 +1107,8 @@ EOF
|
||||
fi
|
||||
|
||||
# check if daemon is hung
|
||||
is_hung=$(is_process_hung $name $type)
|
||||
if [ "$is_hung" = "true" ]; then
|
||||
is_process_hung $name $type
|
||||
if [ $? -eq 0 ]; then
|
||||
echo "$name: hung."
|
||||
# based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
|
||||
# exit codes from 150 to 199 are application specific, therefore we define one here
|
||||
@ -1146,8 +1118,8 @@ EOF
|
||||
if [ "$type" = "osd" ] && [ $BLOCKED_OPS_DETECTION_ENABLED = "true" ]; then
|
||||
up_time=$(get_proc_run_time $name)
|
||||
if [ $up_time -gt $BLOCKED_OPS_START_DETECTION ]; then
|
||||
has_blocked_ops=$(osd_has_blocked_ops $name)
|
||||
if [ "$has_blocked_ops" = "true" ]; then
|
||||
osd_has_blocked_ops $name
|
||||
if [ $? -eq 0 ]; then
|
||||
echo "$name: blocked ops."
|
||||
# based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
|
||||
# exit codes from 150 to 199 are application specific, therefore we define one here
|
||||
|
@ -364,7 +364,7 @@ stop ()
|
||||
|
||||
log INFO "Ceph ${cmd^^} ${service} command received."
|
||||
|
||||
if [ ! -z "${service}"]; then
|
||||
if [ ! -z "${service}" ]; then
|
||||
has_daemon_running ${service}
|
||||
if [ $? -ne 0 ]; then
|
||||
log INFO "Ceph ${service} daemon is already stopped. No action is required."
|
||||
|
Loading…
x
Reference in New Issue
Block a user