From 2b0c73a1c81c041d5c971f90465d4a789799953d Mon Sep 17 00:00:00 2001 From: Felipe Sanches Zanoni Date: Fri, 23 Aug 2024 09:11:46 -0300 Subject: [PATCH] Improve Ceph status check Improve the Ceph status check by calling the Ceph restful API using curl instead of the Ceph CLI. Uses the Ceph CLI as a fallback when the restful api is not working. The Ceph CLI takes around 250 ms to reply to a request, while the restful API takes around 35 ms for the same request. It reduces the load when checking Ceph status. To make rest API calls using curl, it needs the service url and the restful key. Since it takes time and resources to acquire those values, they are cached in the /var/run/ceph_hang directory. The .ceph_mgr_restful_key keeps the restful API key and the .ceph_mgr_service keeps the url to the service. Test-Plan: Test coverage: - Read/parse osd flag noup with error string injection; - Read/parse Ceph osd tree with error string injection; - Read/parse Ceph status with error string injection; - Read/parse Ceph health detail with error string injection; - Fallback to Ceph CLI when restful API is not available; - Swact controllers; - Power cycle standby controller; - Power cycle active controller; - Power cycle compute/storage node with ceph monitor; - DOR testing. PASS: AIO-SX PASS: AIO-DX PASS: Standard 2+2 PASS: Storage 2+2+2 Closes-bug: 2077673 Signed-off-by: Felipe Sanches Zanoni Change-Id: I635bb0dc8dc674ff92567f71be6305271a6f87e0 --- .../debian/deb_folder/ceph-base.ceph.init | 176 ++++++++++++++---- 1 file changed, 136 insertions(+), 40 deletions(-) diff --git a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init index fe47b34a5..d6a45c35c 100755 --- a/ceph/ceph/debian/deb_folder/ceph-base.ceph.init +++ b/ceph/ceph/debian/deb_folder/ceph-base.ceph.init @@ -293,6 +293,115 @@ run_state_machine() { fi } +load_ceph_mgr_restful_key() { + [ ! -z "${MGR_RESTFUL_KEY}" ] && return 0 + + MGR_RESTFUL_KEY=$(cat "${DATA_PATH}/.ceph_mgr_restful_key") + [ -z "${MGR_RESTFUL_KEY}" ] && renew_ceph_mgr_restful_key + [ -z "${MGR_RESTFUL_KEY}" ] && return 1 + return 0 +} + +renew_ceph_mgr_restful_key() { + local ceph_cmd="ceph restful list-keys" + execute_ceph_cmd MGR_RESTFUL_KEY "" "$ceph_cmd" + [ $? -ne 0 ] && return 1 + + MGR_RESTFUL_KEY=$(echo -e "${MGR_RESTFUL_KEY}" | awk '/admin/ { gsub("\"", "", $2); print $2 }') + + if [ ! -z "${MGR_RESTFUL_KEY}" ]; then + echo ${MGR_RESTFUL_KEY} > ${DATA_PATH}/.ceph_mgr_restful_key + return 0 + fi + return 1 +} + +load_ceph_mgr_restful_service() { + [ ! -z "${MGR_RESTFUL_SERVICE}" ] && return 0 + + MGR_RESTFUL_SERVICE=$(cat "${DATA_PATH}/.ceph_mgr_restful_service") + [ -z "${MGR_RESTFUL_SERVICE}" ] && renew_ceph_mgr_restful_service + [ -z "${MGR_RESTFUL_SERVICE}" ] && return 1 + return 0 +} + +renew_ceph_mgr_restful_service() { + local ceph_cmd="ceph mgr services" + execute_ceph_cmd MGR_RESTFUL_SERVICE "" "$ceph_cmd" + [ $? -ne 0 ] && return 1 + + MGR_RESTFUL_SERVICE=$(echo -e "${MGR_RESTFUL_SERVICE}" | awk '/restful/ {gsub ("\"", "", $2); split($2, str, "/"); print str[3]}') + + if [ ! -z "${MGR_RESTFUL_SERVICE}" ]; then + echo ${MGR_RESTFUL_SERVICE} > ${DATA_PATH}/.ceph_mgr_restful_service + return 0 + fi + return 1 +} + +get_ceph_health_detail() { + if [ -z "${CEPH_HEALTH_DETAIL}" ]; then + process_curl_output CEPH_HEALTH_DETAIL '{"format": "text", "prefix": "health", "detail": "detail"}' "ceph health detail" + if [ $? 
-ne 0 ]; then return 1; fi + fi + return 0 +} + +get_ceph_osd_tree() { + if [ -z "${CEPH_OSD_TREE}" ]; then + process_curl_output CEPH_OSD_TREE '{"format": "text", "prefix": "osd tree"}' "ceph osd tree" + if [ $? -ne 0 ]; then return 1; fi + fi + return 0 +} + +get_ceph_status() { + if [ -z "${CEPH_STATUS}" ]; then + process_curl_output CEPH_STATUS '{"format": "text", "prefix": "status"}' "ceph status" + if [ $? -ne 0 ]; then return 1; fi + fi + return 0 +} + +process_curl_output() { + local curl_cmd="$2" + local ceph_cmd="$3" + local output="" + local curl_max_time_sec=5 + local curl_output="" + + load_ceph_mgr_restful_service + load_ceph_mgr_restful_key + + curl_output=$(curl --max-time ${curl_max_time_sec} -k https://admin:${MGR_RESTFUL_KEY}@${MGR_RESTFUL_SERVICE}/request?wait=1 -X POST -H 'Content-Type: application/json' -d "${curl_cmd}" 2>/dev/null) + local error_code=$? + + if [ ${error_code} -eq 0 ]; then + output=$(echo "${curl_output}" | awk '/"outb"|"message"/ {if (match($0, "Incorrect password")) {print "Wrong password"}; if (match($0, "outb")) {split($0, msg, "\""); gsub(/\\n/, "\n", msg[4]); print msg[4]} }') + else + output="No connection" + fi + + if [ "${output}" == "Wrong password" ]; then + log "restful" "WARN" "Ceph restful api command failed: Wrong password" + renew_ceph_mgr_restful_key + + log "restful" "WARN" "Ceph restful api command failed: Fallback to Ceph CLI" + execute_ceph_cmd output "" "${ceph_cmd}" + [ $? -ne 0 ] && return 1 + elif [ "${output}" == "No connection" ]; then + log "restful" "WARN" "Ceph restful api command failed: No connection" + renew_ceph_mgr_restful_service + + log "restful" "WARN" "Ceph restful api command failed: Fallback to Ceph CLI" + execute_ceph_cmd output "" "${ceph_cmd}" + [ $? -ne 0 ] && return 1 + fi + + eval $1=\""${output}"\" + return 0 +} + CEPH_OSD_TREE="" CEPH_HEALTH_DETAIL="" is_process_hung() { @@ -312,12 +421,10 @@ is_process_hung() { fi # Cache Ceph Health for later use as calling Ceph takes time - if [ -z "$CEPH_HEALTH_DETAIL" ]; then - execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail" - if [ $? -ne 0 ]; then - log $name "WARN" "Aborting hang check" - return 1 - fi + get_ceph_health_detail + if [ $? -ne 0 ]; then + log $name "WARN" "Aborting hang check" + return 1 fi # Check if an OSD is hung @@ -332,12 +439,10 @@ is_process_hung() { # Multiple OSD processes may be running, so we only run # 'ceph osd tree' once as it takes some time to execute - if [ -z "$CEPH_OSD_TREE" ]; then - execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree" - if [ $? -ne 0 ]; then - log $name "WARN" "Ceph cmd exec failed, aborting hang check" - return 1 - fi + get_ceph_osd_tree + if [ $? -ne 0 ]; then + log $name "WARN" "Ceph cmd exec failed, aborting hang check" + return 1 fi # Get osd status as 'up' or, for any other output, as 'down' @@ -395,12 +500,10 @@ osd_has_blocked_ops() { # Cache Ceph Health for later use as calling Ceph takes time This is # initially cached from the hang check but check and call again here if # needed - if [ -z "$CEPH_HEALTH_DETAIL" ]; then - execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail" - if [ $? -ne 0 ]; then - log $name "WARN" "Aborting blocked ops check" - return 1 - fi + get_ceph_health_detail + if [ $? 
-ne 0 ]; then + log $name "WARN" "Aborting blocked ops check" + return 1 fi # Ignore health check if OSDs are administratively down @@ -414,12 +517,10 @@ osd_has_blocked_ops() { # Multiple OSD processes may be running, so we only run 'ceph osd tree' once # as it takes some time to execute. This is initially cached from the hang # check but check and call again here if needed - if [ -z "$CEPH_OSD_TREE" ]; then - execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree" - if [ $? -ne 0 ]; then - log $name "WARN" "Ceph cmd exec failed, aborting blocked ops check" - return 1 - fi + get_ceph_osd_tree + if [ $? -ne 0 ]; then + log $name "WARN" "Ceph cmd exec failed, aborting blocked ops check" + return 1 fi # Get osd status as 'up' or, for any other output, as 'down' @@ -450,12 +551,10 @@ osd_has_stuck_peering() { # Cache Ceph Health for later use as calling Ceph takes time This is # initially cached from the hang check but check and call again here if # needed - if [ -z "$CEPH_HEALTH_DETAIL" ]; then - execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail" - if [ $? -ne 0 ]; then - log $name "WARN" "Aborting stuck peering check" - return 1 - fi + get_ceph_health_detail + if [ $? -ne 0 ]; then + log $name "WARN" "Aborting stuck peering check" + return 1 fi # Ignore health check if OSDs are administratively up @@ -470,7 +569,7 @@ osd_has_stuck_peering() { file="${DATA_PATH}/.${name}_stuck_peering_start" max_blocked_time=0 - $(echo "$CEPH_HEALTH_DETAIL" | grep "stuck peering" | awk '{split($0,a,"acting"); print a[2]}' | grep -q $id) + $(echo "$CEPH_HEALTH_DETAIL" | awk '/stuck peering/ {split($0,a,"acting"); print a[2]}' | grep -q $id) if [ "$?" -eq 0 ]; then while read -r line; do $(echo $line | awk '{split($0,a,"acting"); print a[2]}' | grep -q $id) @@ -599,7 +698,7 @@ stop_daemon() { timeout=$6 [ -z "$action" ] && action="Stopping" printf "$action Ceph $name on $host..." - do_cmd "if [ -e $pidfile ] ; then + do_cmd "if [ -e $pidfile ] ; then pid=\`cat $pidfile\` timeout=$timeout while ps -p \$pid -o args= | grep -q $daemon; do @@ -762,8 +861,7 @@ if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; th has_mon=1 else has_mon=0 - CEPH_STATUS='' - execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s" + get_ceph_status if [ $? -eq 0 ]; then has_mon=1 fi @@ -792,8 +890,7 @@ fi # This is needed only for Standard deployments if [ "$system_type" == "Standard" ]; then - CEPH_STATUS='' - execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s" + get_ceph_status if [ "$?" -ne 0 ]; then what_out= for name in $what; do @@ -1083,8 +1180,7 @@ EOF [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile # flush journal to data disk in background if [ "${type}" = "osd" ];then - CMD_OUTPUT='' - execute_ceph_cmd CMD_OUTPUT "Ceph Status" "ceph -s" + get_ceph_status if [ $? == 0 ]; then log "${name}" "INFO" "Flushing journal" $(/usr/bin/ceph-osd -i $id --flush-journal) & @@ -1192,10 +1288,10 @@ EOF # first try to gracefully close process, this should be fast if # its threads still respond to the TERM signal - wlog $name "DEBUG" ">>> Sending term signal" + log $name "DEBUG" ">>> Sending term signal" stop_daemon $name ceph-$type $pid_file -SIGTERM "" 5 - wlog $name "DEBUG" ">>> Sending kill signal" # then just kill it + log $name "DEBUG" ">>> Sending kill signal" stop_daemon $name ceph-$type $pid_file -SIGKILL [ -n "$pidfile" ] && rm -f $pidfile
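
For reference, a minimal standalone sketch of the status-check path this patch introduces, assuming the mgr restful module is enabled and an "admin" restful key exists. This is an illustration only, not the init script code: the real logic lives in process_curl_output() and the load/renew helpers above, which additionally renew the cached key or service endpoint when a request is rejected ("Incorrect password") or the connection fails.

#!/bin/sh
# Illustrative sketch (not part of the patch): query the Ceph mgr restful
# API the way the init script does, falling back to the Ceph CLI on failure.
# Cache paths and the "admin" restful user follow the patch; error handling
# is deliberately minimal.

DATA_PATH="/var/run/ceph_hang"                     # cache directory used by the init script
KEY_FILE="${DATA_PATH}/.ceph_mgr_restful_key"
SVC_FILE="${DATA_PATH}/.ceph_mgr_restful_service"

mkdir -p "${DATA_PATH}"

# Load the cached restful API key, or ask Ceph for it and cache the result.
key=$(cat "${KEY_FILE}" 2>/dev/null)
if [ -z "${key}" ]; then
    key=$(ceph restful list-keys | awk '/admin/ {gsub("\"", "", $2); print $2}')
    [ -n "${key}" ] && echo "${key}" > "${KEY_FILE}"
fi

# Load the cached service endpoint (host:port), or look it up and cache it.
svc=$(cat "${SVC_FILE}" 2>/dev/null)
if [ -z "${svc}" ]; then
    svc=$(ceph mgr services | awk '/restful/ {gsub("\"", "", $2); split($2, s, "/"); print s[3]}')
    [ -n "${svc}" ] && echo "${svc}" > "${SVC_FILE}"
fi

# POST the equivalent of "ceph status" to the restful module. The reply is
# JSON; the command output is carried in its "outb" field, which the init
# script extracts with awk. Fall back to the CLI if the request fails or the
# credentials are not available.
out=""
if [ -n "${key}" ] && [ -n "${svc}" ]; then
    out=$(curl --max-time 5 -k -X POST \
          -H 'Content-Type: application/json' \
          -d '{"format": "text", "prefix": "status"}' \
          "https://admin:${key}@${svc}/request?wait=1" 2>/dev/null)
fi
if [ -z "${out}" ]; then
    out=$(ceph status)
fi
echo "${out}"

Caching the key and endpoint under /var/run/ceph_hang avoids paying the cost of "ceph restful list-keys" and "ceph mgr services" on every health check; they are only re-fetched when a restful request fails, which is what keeps the common case at roughly 35 ms instead of the ~250 ms CLI round trip cited in the commit message.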