Improve Ceph status check

Improve the Ceph status check by calling the Ceph restful API with
curl instead of the Ceph CLI. The Ceph CLI is used as a fallback when
the restful API is not available.

The Ceph CLI takes around 250 ms to reply to a request, while the
restful API takes around 35 ms for the same request. This reduces the
load when checking the Ceph status.

To make restful API calls with curl, the script needs the service URL
and the restful API key. Since it takes time and resources to acquire
those values, they are cached in the /var/run/ceph_hang directory:
.ceph_mgr_restful_key keeps the restful API key and
.ceph_mgr_restful_service keeps the service URL.
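
As an illustration, a minimal sketch of the request pattern the new
helpers use, assuming the cache files above are already populated
(the variable names here are placeholders; the payload shown is the
one used for "ceph status"):

  key=$(cat /var/run/ceph_hang/.ceph_mgr_restful_key)
  service=$(cat /var/run/ceph_hang/.ceph_mgr_restful_service)
  # POST a command to the mgr restful API and wait for its result
  curl --max-time 5 -k \
      "https://admin:${key}@${service}/request?wait=1" \
      -X POST -H 'Content-Type: application/json' \
      -d '{"format": "text", "prefix": "status"}'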

Test-Plan:
 Test coverage:
  - Read/parse osd flag noup with error string injection;
  - Read/parse Ceph osd tree with error string injection;
  - Read/parse Ceph status with error string injection;
  - Read/parse Ceph health detail with error string injection;
  - Fallback to Ceph CLI when restful API is not available;
  - Swact controllers;
  - Power cycle standby controller;
  - Power cycle active controller;
  - Power cycle compute/storage node with ceph monitor;
  - DOR testing.

 PASS: AIO-SX
 PASS: AIO-DX
 PASS: Standard 2+2
 PASS: Storage 2+2+2

Closes-bug: 2077673

Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com>
Change-Id: I635bb0dc8dc674ff92567f71be6305271a6f87e0

@@ -293,6 +293,115 @@ run_state_machine() {
     fi
 }
 
+load_ceph_mgr_restful_key() {
+    [ ! -z "${MGR_RESTFUL_KEY}" ] && return 0
+
+    MGR_RESTFUL_KEY=$(cat "${DATA_PATH}/.ceph_mgr_restful_key")
+    [ -z "${MGR_RESTFUL_KEY}" ] && renew_ceph_mgr_restful_key
+    [ -z "${MGR_RESTFUL_KEY}" ] && return 1
+
+    return 0
+}
+
+renew_ceph_mgr_restful_key() {
+    local ceph_cmd="ceph restful list-keys"
+
+    execute_ceph_cmd MGR_RESTFUL_KEY "" "$ceph_cmd"
+    [ $? -ne 0 ] && return 1
+
+    MGR_RESTFUL_KEY=$(echo -e "${MGR_RESTFUL_KEY}" | awk '/admin/ { gsub("\"", "", $2); print $2 }')
+    if [ ! -z "${MGR_RESTFUL_KEY}" ]; then
+        echo ${MGR_RESTFUL_KEY} > ${DATA_PATH}/.ceph_mgr_restful_key
+        return 0
+    fi
+
+    return 1
+}
+
+load_ceph_mgr_restful_service() {
+    [ ! -z "${MGR_RESTFUL_SERVICE}" ] && return 0
+
+    MGR_RESTFUL_SERVICE=$(cat "${DATA_PATH}/.ceph_mgr_restful_service")
+    [ -z "${MGR_RESTFUL_SERVICE}" ] && renew_ceph_mgr_restful_service
+    [ -z "${MGR_RESTFUL_SERVICE}" ] && return 1
+
+    return 0
+}
+
+renew_ceph_mgr_restful_service() {
+    local ceph_cmd="ceph mgr services"
+
+    execute_ceph_cmd MGR_RESTFUL_SERVICE "" "$ceph_cmd"
+    [ $? -ne 0 ] && return 1
+
+    MGR_RESTFUL_SERVICE=$(echo -e "${MGR_RESTFUL_SERVICE}" | awk '/restful/ {gsub ("\"", "", $2); split($2, str, "/"); print str[3]}')
+    if [ ! -z "${MGR_RESTFUL_SERVICE}" ]; then
+        echo ${MGR_RESTFUL_SERVICE} > ${DATA_PATH}/.ceph_mgr_restful_service
+        return 0
+    fi
+
+    return 1
+}
+
+get_ceph_health_detail() {
+    if [ -z "${CEPH_HEALTH_DETAIL}" ]; then
+        process_curl_output CEPH_HEALTH_DETAIL '{"format": "text", "prefix": "health", "detail": "detail"}' "ceph health detail"
+        if [ $? -ne 0 ]; then return 1; fi
+    fi
+    return 0
+}
+
+get_ceph_osd_tree() {
+    if [ -z "${CEPH_OSD_TREE}" ]; then
+        process_curl_output CEPH_OSD_TREE '{"format": "text", "prefix": "osd tree"}' "ceph osd tree"
+        if [ $? -ne 0 ]; then return 1; fi
+    fi
+    return 0
+}
+
+get_ceph_status() {
+    if [ -z "${CEPH_STATUS}" ]; then
+        process_curl_output CEPH_STATUS '{"format": "text", "prefix": "status"}' "ceph status"
+        if [ $? -ne 0 ]; then return 1; fi
+    fi
+    return 0
+}
+
+process_curl_output() {
+    local curl_cmd="$2"
+    local ceph_cmd="$3"
+    local output=""
+    local curl_max_time_sec=5
+    local curl_output=""
+
+    load_ceph_mgr_restful_service
+    load_ceph_mgr_restful_key
+
+    curl_output=$(curl --max-time ${curl_max_time_sec} -k https://admin:${MGR_RESTFUL_KEY}@${MGR_RESTFUL_SERVICE}/request?wait=1 -X POST -H 'Content-Type: application/json' -d "${curl_cmd}" 2>/dev/null)
+    local error_code=$?
+
+    if [ ${error_code} -eq 0 ]; then
+        output=$(echo "${curl_output}" | awk '/"outb"|"message"/ {if (match($0, "Incorrect password")) {print "Wrong password"}; if (match($0, "outb")) {split($0, msg, "\""); gsub(/\\n/, "\n", msg[4]); print msg[4]} }')
+    else
+        output="No connection"
+    fi
+
+    if [ "${output}" == "Wrong password" ]; then
+        log "restful" "WARN" "Ceph restful api command failed: Wrong password"
+        renew_ceph_mgr_restful_key
+        log "restful" "WARN" "Ceph restful api command failed: Fallback to Ceph CLI"
+        execute_ceph_cmd output "" "${ceph_cmd}"
+        [ $? -ne 0 ] && return 1
+    elif [ "${output}" == "No connection" ]; then
+        log "restful" "WARN" "Ceph restful api command failed: No connection"
+        renew_ceph_mgr_restful_service
+        log "restful" "WARN" "Ceph restful api command failed: Fallback to Ceph CLI"
+        execute_ceph_cmd output "" "${ceph_cmd}"
+        [ $? -ne 0 ] && return 1
+    fi
+
+    eval $1=\""${output}"\"
+    return 0
+}
+
 CEPH_OSD_TREE=""
 CEPH_HEALTH_DETAIL=""
 
 is_process_hung() {
@@ -312,12 +421,10 @@ is_process_hung() {
     fi
 
     # Cache Ceph Health for later use as calling Ceph takes time
-    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
-        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
-        if [ $? -ne 0 ]; then
-            log $name "WARN" "Aborting hang check"
-            return 1
-        fi
+    get_ceph_health_detail
+    if [ $? -ne 0 ]; then
+        log $name "WARN" "Aborting hang check"
+        return 1
     fi
 
     # Check if an OSD is hung
@@ -332,12 +439,10 @@ is_process_hung() {
 
     # Multiple OSD processes may be running, so we only run
     # 'ceph osd tree' once as it takes some time to execute
-    if [ -z "$CEPH_OSD_TREE" ]; then
-        execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
-        if [ $? -ne 0 ]; then
-            log $name "WARN" "Ceph cmd exec failed, aborting hang check"
-            return 1
-        fi
+    get_ceph_osd_tree
+    if [ $? -ne 0 ]; then
+        log $name "WARN" "Ceph cmd exec failed, aborting hang check"
+        return 1
     fi
 
     # Get osd status as 'up' or, for any other output, as 'down'
@@ -395,12 +500,10 @@ osd_has_blocked_ops() {
     # Cache Ceph Health for later use as calling Ceph takes time This is
     # initially cached from the hang check but check and call again here if
     # needed
-    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
-        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
-        if [ $? -ne 0 ]; then
-            log $name "WARN" "Aborting blocked ops check"
-            return 1
-        fi
+    get_ceph_health_detail
+    if [ $? -ne 0 ]; then
+        log $name "WARN" "Aborting blocked ops check"
+        return 1
     fi
 
     # Ignore health check if OSDs are administratively down
@@ -414,12 +517,10 @@ osd_has_blocked_ops() {
     # Multiple OSD processes may be running, so we only run 'ceph osd tree' once
    # as it takes some time to execute. This is initially cached from the hang
    # check but check and call again here if needed
-    if [ -z "$CEPH_OSD_TREE" ]; then
-        execute_ceph_cmd CEPH_OSD_TREE $name "ceph osd tree"
-        if [ $? -ne 0 ]; then
-            log $name "WARN" "Ceph cmd exec failed, aborting blocked ops check"
-            return 1
-        fi
+    get_ceph_osd_tree
+    if [ $? -ne 0 ]; then
+        log $name "WARN" "Ceph cmd exec failed, aborting blocked ops check"
+        return 1
     fi
 
     # Get osd status as 'up' or, for any other output, as 'down'
@@ -450,12 +551,10 @@ osd_has_stuck_peering() {
     # Cache Ceph Health for later use as calling Ceph takes time This is
     # initially cached from the hang check but check and call again here if
     # needed
-    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
-        execute_ceph_cmd CEPH_HEALTH_DETAIL $name "ceph health detail"
-        if [ $? -ne 0 ]; then
-            log $name "WARN" "Aborting stuck peering check"
-            return 1
-        fi
+    get_ceph_health_detail
+    if [ $? -ne 0 ]; then
+        log $name "WARN" "Aborting stuck peering check"
+        return 1
     fi
 
     # Ignore health check if OSDs are administratively up
@@ -470,7 +569,7 @@ osd_has_stuck_peering() {
     file="${DATA_PATH}/.${name}_stuck_peering_start"
     max_blocked_time=0
 
-    $(echo "$CEPH_HEALTH_DETAIL" | grep "stuck peering" | awk '{split($0,a,"acting"); print a[2]}' | grep -q $id)
+    $(echo "$CEPH_HEALTH_DETAIL" | awk '/stuck peering/ {split($0,a,"acting"); print a[2]}' | grep -q $id)
     if [ "$?" -eq 0 ]; then
         while read -r line; do
             $(echo $line | awk '{split($0,a,"acting"); print a[2]}' | grep -q $id)
@@ -599,7 +698,7 @@ stop_daemon() {
     timeout=$6
     [ -z "$action" ] && action="Stopping"
     printf "$action Ceph $name on $host..."
-    do_cmd "if [ -e $pidfile ] ; then
+    do_cmd "if [ -e $pidfile ] ; then
         pid=\`cat $pidfile\`
         timeout=$timeout
         while ps -p \$pid -o args= | grep -q $daemon; do
@ -762,8 +861,7 @@ if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; th
has_mon=1 has_mon=1
else else
has_mon=0 has_mon=0
CEPH_STATUS='' get_ceph_status
execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s"
if [ $? -eq 0 ]; then if [ $? -eq 0 ]; then
has_mon=1 has_mon=1
fi fi
@@ -792,8 +890,7 @@ fi
 
 # This is needed only for Standard deployments
 if [ "$system_type" == "Standard" ]; then
-    CEPH_STATUS=''
-    execute_ceph_cmd CEPH_STATUS "ceph status" "ceph -s"
+    get_ceph_status
     if [ "$?" -ne 0 ]; then
         what_out=
         for name in $what; do
@@ -1083,8 +1180,7 @@ EOF
         [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
         # flush journal to data disk in background
         if [ "${type}" = "osd" ];then
-            CMD_OUTPUT=''
-            execute_ceph_cmd CMD_OUTPUT "Ceph Status" "ceph -s"
+            get_ceph_status
             if [ $? == 0 ]; then
                 log "${name}" "INFO" "Flushing journal"
                 $(/usr/bin/ceph-osd -i $id --flush-journal) &
@@ -1192,10 +1288,10 @@ EOF
 
         # first try to gracefully close process, this should be fast if
         # its threads still respond to the TERM signal
-        wlog $name "DEBUG" ">>> Sending term signal"
+        log $name "DEBUG" ">>> Sending term signal"
         stop_daemon $name ceph-$type $pid_file -SIGTERM "" 5
-        wlog $name "DEBUG" ">>> Sending kill signal"
 
         # then just kill it
+        log $name "DEBUG" ">>> Sending kill signal"
         stop_daemon $name ceph-$type $pid_file -SIGKILL
         [ -n "$pidfile" ] && rm -f $pidfile