Fix ceph init script variable caching

The Ceph init script status function should cache the output of the costly Ceph CLI commands: ceph status, ceph osd tree and ceph health detail. The variable caching was not working because the Ceph CLI commands were being called in subshells, which do not share variables with the calling shell, leading to empty caches. The script now calls functions directly instead of through subshells, so the caching works properly, reducing the execution time of the status commands. Other changes included: - removal of the unused execute_ceph_cmd(), which was always being overridden by/used from ceph_common.sh - removal of the ceph version call Test-Plan: PASS: AIO-SX/AIO-DX/Standard/Storage: Check if ceph init script works without any errors when calling start/stop/status commands. Partial-bug: 2077673 Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com> Change-Id: I4cf26fec24d4b1cef86a928f044fb1818caa6705
2024-08-21 14:14:59 -03:00 · 2024-08-21 14:14:59 -03:00 · 7f588d1f7c
commit 7f588d1f7c
parent bcfb26840b
2 changed files with 28 additions and 56 deletions
--- a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init
+++ b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init
@ -290,29 +290,6 @@ run_state_machine() {
    fi
 }

-CEPH_FAILURE=""
-execute_ceph_cmd() {
-    # execute a comand and in case it timeouts mark ceph as failed
-    local name=$1
-    local cmd=$2
-    local cmd="timeout $WAIT_FOR_CMD $cmd"
-    set -o pipefail
-    eval "$cmd >$DATA_PATH/.ceph_cmd_out"
-    errcode=$?
-    set +o pipefail
-    if [ -z "$output" ] && [ $errcode -eq 124 ]; then  # 'timeout' returns 124 when timing out
-        log $name "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd"
-        CEPH_FAILURE="true"
-        echo ""; return 1
-    fi
-    output=$(cat $DATA_PATH/.ceph_cmd_out)
-    if [ -z "$output" ] || [ $errcode -ne 0 ]; then
-        log $name "WARN" "Error executing: $cmd errorcode: $errcode output: $output"
-        echo ""; return 1
-    fi
-    echo "$output"; return $errcode
-}
-
 CEPH_OSD_TREE=""
 CEPH_HEALTH_DETAIL=""
 is_process_hung() {
@ -322,7 +299,7 @@ is_process_hung() {
    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        log $name "WARN" "Ceph cluster is marked as failed, aborting hang check"
-        echo "false"; return
+        return 1
    fi

    # Cache Ceph Health for later use as calling Ceph takes time
@ -330,7 +307,7 @@ is_process_hung() {
        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
        if [ $? -ne 0 ]; then
            log $name "WARN" "Aborting hang check"
-            echo "false"; return
+            return 1
        fi
    fi

@ -341,7 +318,7 @@ is_process_hung() {
        $(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
        if [ $? -eq 0 ]; then
           log $name "WARN" "Ceph 'noup' flag is set, aborting hang check"
-           echo "false"; return
+           return 1
        fi

        # Multiple OSD processes may be running, so we only run
@ -350,7 +327,7 @@ is_process_hung() {
            execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
            if [ $? -ne 0 ]; then
                log $name "WARN" "Ceph cmd exec failed, aborting hang check"
-                echo "false"; return
+                return 1
            fi
        fi

@ -365,9 +342,9 @@ is_process_hung() {
        local state=$(run_state_machine $name $type $osd_status \
                      $WAIT_FOR_OSD_OPERATIONAL $WAIT_FOR_OSD_DOWN_CONFIRM)
        if [ "$state" = "$ST_HANGED" ]; then
-            echo "true"; return
+            return 0
        else
-            echo "false"; return
+            return 1
        fi


@ -383,18 +360,18 @@ is_process_hung() {
        local state=$(run_state_machine $name $type $mon_status \
                      $WAIT_FOR_MON_OPERATIONAL $WAIT_FOR_MON_DOWN_CONFIRM)
        if [ "$state" = "$ST_HANGED" ]; then
-            echo "true"; return
+            return 0
        else
-            echo "false"; return
+            return 1
        fi

    elif [ "$type" = "mds" ]; then
-        echo "false"; return
+        return 1

    else
        log $name "WARN" "Unknown process type: $type"
    fi
-    echo "false"
+    return 1
 }

 osd_has_blocked_ops() {
@ -403,7 +380,7 @@ osd_has_blocked_ops() {
    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        log $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check"
-        echo "false"; return
+        return 1
    fi

    # Cache Ceph Health for later use as calling Ceph takes time This is
@ -413,7 +390,7 @@ osd_has_blocked_ops() {
        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
        if [ $? -ne 0 ]; then
            log $name "WARN" "Aborting blocked ops check"
-            echo "false"; return
+            return 1
        fi
    fi

@ -422,7 +399,7 @@ osd_has_blocked_ops() {
    $(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set")
    if [ $? -eq 0 ]; then
        log $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check"
-        echo "false"; return
+        return 1
    fi

    # Multiple OSD processes may be running, so we only run 'ceph osd tree' once
@ -432,7 +409,7 @@ osd_has_blocked_ops() {
        execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
        if [ $? -ne 0 ]; then
            log $name "WARN" "Ceph cmd exec failed, aborting blocked ops check"
-            echo "false"; return
+            return 1
        fi
    fi

@ -444,9 +421,9 @@ osd_has_blocked_ops() {
        [[ "$blocked_time" == "" ]] && blocked_time=0
        if [ $blocked_time -gt $BLOCKED_OPS_RESTART_THRESH ]; then
            log $name "WARN" "Detected blocked operations for $blocked_time seconds"
-            echo "true"; return
+            return 0
        else
-            echo "false"; return
+            return 1
        fi
    fi
 }
@ -458,7 +435,7 @@ osd_has_stuck_peering() {
    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        log $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check"
-        echo "false"; return
+        return 1
    fi

    # Cache Ceph Health for later use as calling Ceph takes time This is
@ -468,7 +445,7 @@ osd_has_stuck_peering() {
        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
        if [ $? -ne 0 ]; then
            log $name "WARN" "Aborting stuck peering check"
-            echo "false"; return
+            return 1
        fi
    fi

@ -477,7 +454,7 @@ osd_has_stuck_peering() {
    $(echo "$CEPH_HEALTH_DETAIL" | grep -q "nodown.*set")
    if [ $? -eq 0 ]; then
        log $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check"
-        echo "false"; return
+        return 1
    fi


@ -513,9 +490,9 @@ osd_has_stuck_peering() {

            if [ $blocked_time -gt $STUCK_PEERING_RESTART_THRESH ]; then
                log $name "WARN" "Detected stuck peering for $blocked_time seconds"
-                echo "true"; return
+                return 0
            else
-                echo "false"; return
+                return 1
            fi
        else
            # register the time for first detected stuck peering
@ -524,8 +501,6 @@ osd_has_stuck_peering() {
    else
        rm -f ${file} 2>/dev/null
    fi
-
-
 }

 ######################
@ -601,8 +576,8 @@ daemon_is_running() {
    daemon_id=$3
    pidfile=$4
    do_cmd "[ -e $pidfile ] || exit 1   # no pid, presumably not running
-	pid=\`cat $pidfile\`
-	ps -p \$pid -o args= | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running
+        pid=\`cat $pidfile\`
+        cat /proc/\$pid/cmdline | tr '\\0' ' ' | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running
        exit 1  # pid is something else" "" "okfail"
 }

@ -1119,9 +1094,6 @@ EOF
 	status)
 	    if daemon_is_running $name ceph-$type $id $pid_file; then

-		# ceph processes answer in around 100ms when the process works correctly
-		do_cmd "timeout 1 $BINDIR/ceph --admin-daemon $asok version 2>/dev/null || echo unknown"
-
        	# log ceph osd state
 		if [ "$type" = "osd" ];then
 		    CEPH_DAEMON_STATUS=""
@ -1135,8 +1107,8 @@ EOF
 		fi

 		# check if daemon is hung
-		is_hung=$(is_process_hung $name $type)
-		if [ "$is_hung" = "true" ]; then
+		is_process_hung $name $type
+		if [ $? -eq 0 ]; then
 		echo "$name: hung."
 		# based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
 		# exit codes from 150 to 199 are application specific, therefore we define one here
@ -1146,8 +1118,8 @@ EOF
 		if [ "$type" = "osd" ] && [ $BLOCKED_OPS_DETECTION_ENABLED = "true" ]; then
 		    up_time=$(get_proc_run_time $name)
 		    if [ $up_time -gt $BLOCKED_OPS_START_DETECTION ]; then
-		        has_blocked_ops=$(osd_has_blocked_ops $name)
-		        if [ "$has_blocked_ops" = "true" ]; then
+		        osd_has_blocked_ops $name
+		        if [ $? -eq 0 ]; then
 		            echo "$name: blocked ops."
 		            # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
 		            # exit codes from 150 to 199 are application specific, therefore we define one here
--- a/ceph/ceph/files/ceph-init-wrapper.sh
+++ b/ceph/ceph/files/ceph-init-wrapper.sh
@ -364,7 +364,7 @@ stop ()

    log INFO "Ceph ${cmd^^} ${service} command received."

-    if [ ! -z "${service}"]; then
+    if [ ! -z "${service}" ]; then
        has_daemon_running ${service}
        if [ $? -ne 0 ]; then
            log INFO "Ceph ${service} daemon is already stopped. No action is required."