From 7f588d1f7c9b4026dd5dfd600a9256cd333013fd Mon Sep 17 00:00:00 2001 From: Felipe Sanches Zanoni Date: Wed, 21 Aug 2024 14:14:59 -0300 Subject: [PATCH] Fix ceph init script variable caching The Ceph init script status function should cache output from Ceph CLI commands that are costly: ceph status, ceph osd tree and ceph health detail commands. The variable caching was not working because the Ceph CLI commands were being called in subshells, which do not share variables, leading to empty caches. The script is now calling functions instead of subshells and the caching is working properly, reducing the execution time of the status commands. Other changes included: - removal of unused execute_ceph_cmd() which was always being overridden/used from ceph_common.sh - removal of ceph version call Test-Plan: PASS: AIO-SX/AIO-DX/Standard/Storage: Check if ceph init script works without any errors when calling start/stop/status commands. Partial-bug: 2077673 Signed-off-by: Felipe Sanches Zanoni Change-Id: I4cf26fec24d4b1cef86a928f044fb1818caa6705 --- .../debian/deb_folder/ceph-base.ceph.init | 82 ++++++------------- ceph/ceph/files/ceph-init-wrapper.sh | 2 +- 2 files changed, 28 insertions(+), 56 deletions(-) diff --git a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init index 21b1c05c3..322a6ca68 100755 --- a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init +++ b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init @@ -290,29 +290,6 @@ run_state_machine() { fi } -CEPH_FAILURE="" -execute_ceph_cmd() { - # execute a comand and in case it timeouts mark ceph as failed - local name=$1 - local cmd=$2 - local cmd="timeout $WAIT_FOR_CMD $cmd" - set -o pipefail - eval "$cmd >$DATA_PATH/.ceph_cmd_out" - errcode=$? 
- set +o pipefail - if [ -z "$output" ] && [ $errcode -eq 124 ]; then # 'timeout' returns 124 when timing out - log $name "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd" - CEPH_FAILURE="true" - echo ""; return 1 - fi - output=$(cat $DATA_PATH/.ceph_cmd_out) - if [ -z "$output" ] || [ $errcode -ne 0 ]; then - log $name "WARN" "Error executing: $cmd errorcode: $errcode output: $output" - echo ""; return 1 - fi - echo "$output"; return $errcode -} - CEPH_OSD_TREE="" CEPH_HEALTH_DETAIL="" is_process_hung() { @@ -322,7 +299,7 @@ is_process_hung() { # Abort if we had previous errors with Ceph if [ "$CEPH_FAILURE" = "true" ]; then log $name "WARN" "Ceph cluster is marked as failed, aborting hang check" - echo "false"; return + return 1 fi # Cache Ceph Health for later use as calling Ceph takes time @@ -330,7 +307,7 @@ is_process_hung() { execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail" if [ $? -ne 0 ]; then log $name "WARN" "Aborting hang check" - echo "false"; return + return 1 fi fi @@ -341,7 +318,7 @@ is_process_hung() { $(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set") if [ $? -eq 0 ]; then log $name "WARN" "Ceph 'noup' flag is set, aborting hang check" - echo "false"; return + return 1 fi # Multiple OSD processes may be running, so we only run @@ -350,7 +327,7 @@ is_process_hung() { execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree" if [ $? 
-ne 0 ]; then log $name "WARN" "Ceph cmd exec failed, aborting hang check" - echo "false"; return + return 1 fi fi @@ -365,9 +342,9 @@ is_process_hung() { local state=$(run_state_machine $name $type $osd_status \ $WAIT_FOR_OSD_OPERATIONAL $WAIT_FOR_OSD_DOWN_CONFIRM) if [ "$state" = "$ST_HANGED" ]; then - echo "true"; return + return 0 else - echo "false"; return + return 1 fi @@ -383,18 +360,18 @@ is_process_hung() { local state=$(run_state_machine $name $type $mon_status \ $WAIT_FOR_MON_OPERATIONAL $WAIT_FOR_MON_DOWN_CONFIRM) if [ "$state" = "$ST_HANGED" ]; then - echo "true"; return + return 0 else - echo "false"; return + return 1 fi elif [ "$type" = "mds" ]; then - echo "false"; return + return 1 else log $name "WARN" "Unknown process type: $type" fi - echo "false" + return 1 } osd_has_blocked_ops() { @@ -403,7 +380,7 @@ osd_has_blocked_ops() { # Abort if we had previous errors with Ceph if [ "$CEPH_FAILURE" = "true" ]; then log $name "WARN" "Ceph cluster is marked as failed, aborting blocked ops check" - echo "false"; return + return 1 fi # Cache Ceph Health for later use as calling Ceph takes time This is @@ -413,7 +390,7 @@ osd_has_blocked_ops() { execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail" if [ $? -ne 0 ]; then log $name "WARN" "Aborting blocked ops check" - echo "false"; return + return 1 fi fi @@ -422,7 +399,7 @@ osd_has_blocked_ops() { $(echo "$CEPH_HEALTH_DETAIL" | grep -q "noup.*set") if [ $? -eq 0 ]; then log $name "WARN" "Ceph 'noup' flag is set, aborting blocked ops check" - echo "false"; return + return 1 fi # Multiple OSD processes may be running, so we only run 'ceph osd tree' once @@ -432,7 +409,7 @@ osd_has_blocked_ops() { execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree" if [ $? 
-ne 0 ]; then log $name "WARN" "Ceph cmd exec failed, aborting blocked ops check" - echo "false"; return + return 1 fi fi @@ -444,9 +421,9 @@ osd_has_blocked_ops() { [[ "$blocked_time" == "" ]] && blocked_time=0 if [ $blocked_time -gt $BLOCKED_OPS_RESTART_THRESH ]; then log $name "WARN" "Detected blocked operations for $blocked_time seconds" - echo "true"; return + return 0 else - echo "false"; return + return 1 fi fi } @@ -458,7 +435,7 @@ osd_has_stuck_peering() { # Abort if we had previous errors with Ceph if [ "$CEPH_FAILURE" = "true" ]; then log $name "WARN" "Ceph cluster is marked as failed, aborting stuck peering check" - echo "false"; return + return 1 fi # Cache Ceph Health for later use as calling Ceph takes time This is @@ -468,7 +445,7 @@ osd_has_stuck_peering() { execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail" if [ $? -ne 0 ]; then log $name "WARN" "Aborting stuck peering check" - echo "false"; return + return 1 fi fi @@ -477,7 +454,7 @@ osd_has_stuck_peering() { $(echo "$CEPH_HEALTH_DETAIL" | grep -q "nodown.*set") if [ $? 
-eq 0 ]; then log $name "WARN" "Ceph 'nodown' flag is set, aborting stuck peering check" - echo "false"; return + return 1 fi @@ -513,9 +490,9 @@ osd_has_stuck_peering() { if [ $blocked_time -gt $STUCK_PEERING_RESTART_THRESH ]; then log $name "WARN" "Detected stuck peering for $blocked_time seconds" - echo "true"; return + return 0 else - echo "false"; return + return 1 fi else # register the time for first detected stuck peering @@ -524,8 +501,6 @@ osd_has_stuck_peering() { else rm -f ${file} 2>/dev/null fi - - } ###################### @@ -601,8 +576,8 @@ daemon_is_running() { daemon_id=$3 pidfile=$4 do_cmd "[ -e $pidfile ] || exit 1 # no pid, presumably not running - pid=\`cat $pidfile\` - ps -p \$pid -o args= | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running + pid=\`cat $pidfile\` + cat /proc/\$pid/cmdline | tr '\\0' ' ' | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running exit 1 # pid is something else" "" "okfail" } @@ -1119,9 +1094,6 @@ EOF status) if daemon_is_running $name ceph-$type $id $pid_file; then - # ceph processes answer in around 100ms when the process works correctly - do_cmd "timeout 1 $BINDIR/ceph --admin-daemon $asok version 2>/dev/null || echo unknown" - # log ceph osd state if [ "$type" = "osd" ];then CEPH_DAEMON_STATUS="" @@ -1135,8 +1107,8 @@ EOF fi # check if daemon is hung - is_hung=$(is_process_hung $name $type) - if [ "$is_hung" = "true" ]; then + is_process_hung $name $type + if [ $? -eq 0 ]; then echo "$name: hung." # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html # exit codes from 150 to 199 are application specific, therefore we define one here @@ -1146,8 +1118,8 @@ EOF if [ "$type" = "osd" ] && [ $BLOCKED_OPS_DETECTION_ENABLED = "true" ]; then up_time=$(get_proc_run_time $name) if [ $up_time -gt $BLOCKED_OPS_START_DETECTION ]; then - has_blocked_ops=$(osd_has_blocked_ops $name) - if [ "$has_blocked_ops" = "true" ]; then + osd_has_blocked_ops $name + if [ $? 
-eq 0 ]; then echo "$name: blocked ops." # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html # exit codes from 150 to 199 are application specific, therefore we define one here diff --git a/ceph/ceph/files/ceph-init-wrapper.sh b/ceph/ceph/files/ceph-init-wrapper.sh index d1ad73b83..c772598c0 100755 --- a/ceph/ceph/files/ceph-init-wrapper.sh +++ b/ceph/ceph/files/ceph-init-wrapper.sh @@ -364,7 +364,7 @@ stop () log INFO "Ceph ${cmd^^} ${service} command received." - if [ ! -z "${service}"]; then + if [ ! -z "${service}" ]; then has_daemon_running ${service} if [ $? -ne 0 ]; then log INFO "Ceph ${service} daemon is already stopped. No action is required."