Improvements for ceph-client helm tests

This commit introduces the following helm test improvements for the
ceph-client chart:

1) Reworks the pg_validation function so that it allows some time for
peering PGs to finish peering, but fails if any other critical errors
are seen. The actual PG validation has been split out into a new
function called check_pgs(), and pg_validation now manages the retry
loop around it.

2) The check_cluster_status function now calls pg_validation if the
cluster status is not HEALTH_OK. This is very similar to the previous
behavior, except the PG-checking logic is no longer duplicated.
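
For context, both functions parse the JSON output of
'ceph pg ls -f json-pretty', which contains one entry per PG of
roughly the following shape (illustrative fragment only; the exact
fields and their order vary between Ceph releases):

    {
        "pgid": "1.0",
        "state": "active+clean",
        ...
    }

The grep/awk pipelines in the new code rely only on the "pgid" and
"state" lines, where the value of interest is the fourth
double-quote-delimited field.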

Change-Id: I65906380817441bd2ff9ff9cfbf9586b6fdd2ba7
Author: Parsons, Cliff (cp769u), 2021-01-12 15:43:19 +00:00
parent 204c51a669
commit 970c23acf4
2 changed files with 76 additions and 25 deletions


@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph Client
 name: ceph-client
-version: 0.1.3
+version: 0.1.4
 home: https://github.com/ceph/ceph-client
 ...


@@ -25,25 +25,7 @@ function check_cluster_status() {
     echo "Ceph status is HEALTH_OK"
   else
     echo "Ceph cluster status is not HEALTH_OK, checking PG states"
-    retries=0
-    # If all PGs are active, pass
-    # This grep is just as robust as jq and is Ceph-version agnostic unlike jq
-    while [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v "active") ]] && [[ retries -lt 60 ]]; do
-      # If all inactive PGs are peering, wait for peering to complete
-      # Run 'ceph pg ls' again before failing in case PG states have changed
-      if [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v -e "active" -e "peering") ]]; then
-        # If inactive PGs aren't peering, fail
-        echo "Failure, found inactive PGs that aren't peering"
-        exit 1
-      fi
-      sleep 3
-      ((retries=retries+1))
-    done
-    # If peering PGs haven't gone active after retries have expired, fail
-    if [[ retries -ge 60 ]]; then
-      echo "PGs appear to be stuck peering"
-      exit 1
-    fi
+    pg_validation
   fi
 }
@@ -264,12 +246,81 @@ function pool_failuredomain_validation() {
   done
 }
-function pg_validation() {
-  ceph pg ls
-  inactive_pgs=(`ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v "active" | grep -B1 '"state":' | awk -F "\"" '/pgid/{print $4}'`)
-  if [ ${#inactive_pgs[*]} -gt 0 ];then
-    echo "There are few incomplete pgs in the cluster"
+function check_pgs() {
+  pgs_transitioning=false
+  ceph --cluster ${CLUSTER} pg dump_stuck -f json-pretty > ${stuck_pgs_file}
+  # Check if there are any stuck PGs, which could indicate a serious problem
+  # if it does not resolve itself soon.
+  stuck_pgs=(`cat ${stuck_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
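+  # (awk -F "\"" splits each line on double quotes, so for a line like
+  #   "pgid": "1.0",
+  # field 4 is the bare pgid value.)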
+  if [[ ${#stuck_pgs[*]} -gt 0 ]]; then
+    # We have at least one stuck pg
+    echo "Some PGs are stuck: "
+    echo ${stuck_pgs[*]}
+    # Not a critical error - yet
+    pgs_transitioning=true
+  else
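+    # The pipeline below keeps only the non-active "state" lines and the
+    # "pgid" line immediately preceding each one (grep -B1), so the pgids
+    # of the inactive PGs can be extracted afterwards.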
+    ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v "active" | grep -B1 '"state":' > ${inactive_pgs_file} || true
+    # If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
+    inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
+    echo "There is at least one inactive pg in the cluster: "
+    echo ${inactive_pgs[*]}
+    echo "Very likely the cluster is rebalancing or recovering some PGs. Checking..."
+    down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
+    if [[ ${#down_pgs[*]} -gt 0 ]]; then
+      # Some PGs could be down. This is a really bad situation and the test must fail.
+      echo "Some PGs are down: "
+      echo ${down_pgs[*]}
+      echo "This is a critical error, exiting."
+      exit 1
+    fi
+    non_peer_recover_pgs=(`cat ${inactive_pgs_file} | grep '"state":' | grep -v -E 'peer|recover' || true`)
+    if [[ ${#non_peer_recover_pgs[*]} -gt 0 ]]; then
+      # Some PGs could be inactive and not peering. Better to fail.
+      echo "We are unsure what's happening: we don't have down/stuck PGs,"
+      echo "but we have some inactive pgs that are not peering/recovering: "
+      pg_list=(`sed -n '/recover\|peer/{s/.*//;x;d;};x;p;${x;p;}' ${inactive_pgs_file} | sed '/^$/d' | awk -F "\"" '/pgid/{print $4}'`)
+      echo ${pg_list[*]}
+      # Critical error. Fail/exit the script
+      exit 1
+    fi
+    peer_recover_pgs=(`cat ${inactive_pgs_file} | grep -B1 -E 'peer|recover' | awk -F "\"" '/pgid/{print $4}'`)
+    if [[ ${#peer_recover_pgs[*]} -gt 0 ]]; then
+      # Some PGs are not in an active state, but are peering and/or the cluster is recovering
+      echo "Some PGs are peering and/or the cluster is recovering: "
+      echo ${peer_recover_pgs[*]}
+      echo "This is normal, but we will wait a while to verify the PGs are not stuck in peering."
+      # not critical, just wait
+      pgs_transitioning=true
+    fi
+  fi
+}
+function pg_validation() {
+  retries=0
+  time_between_retries=3
+  max_retries=60
+  pgs_transitioning=false
+  stuck_pgs_file=$(mktemp -p /tmp)
+  inactive_pgs_file=$(mktemp -p /tmp)
+  # Check this over a period of retries. Fail/stop if any critical errors are found.
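+  # Note: check_pgs runs first on every loop iteration; it exits the
+  # script directly on critical errors and sets pgs_transitioning=true
+  # while PGs are merely stuck/peering, which keeps the loop going.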
+  while check_pgs && [[ "${pgs_transitioning}" == "true" ]] && [[ retries -lt ${max_retries} ]]; do
+    echo "Sleep for a bit waiting for the pg(s) to become active/unstuck..."
+    sleep ${time_between_retries}
+    ((retries=retries+1))
+  done
+  # If peering PGs haven't gone active after the retries have expired, fail
+  if [[ retries -ge ${max_retries} ]]; then
+    ((timeout_sec=${time_between_retries}*${max_retries}))
+    echo "Some PGs have not become active or are still stuck after ${timeout_sec} seconds. Exiting..."
+    exit 1
+  fi
+}