Fix ceph-client helm test

This patch resolves a helm test problem where the test was failing
if it found a PG in the "activating" state. The test could also
encounter other transient PG states, such as premerge or unknown,
and fail on those as well. Note that if these transient PG states
persist for more than 3 minutes, the helm test still fails.

Change-Id: I071bcfedf7e4079e085c2f72d2fbab3adc0b027c
This commit is contained in:
Parsons, Cliff (cp769u) 2021-03-19 21:42:34 +00:00
parent 43226de6e3
commit 167b9eb1a8
3 changed files with 25 additions and 11 deletions

View File

@ -15,6 +15,6 @@ apiVersion: v1
appVersion: v1.0.0
description: OpenStack-Helm Ceph Client
name: ceph-client
version: 0.1.12
version: 0.1.13
home: https://github.com/ceph/ceph-client
...

View File

@ -261,7 +261,10 @@ function check_pgs() {
# Not a critical error - yet
pgs_transitioning=true
else
ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v "active" | grep -B1 '"state":' > ${inactive_pgs_file} || true
# Examine the PGs that have non-active states. Consider those PGs that
# are in a "premerge" state to be similar to active. "premerge" PGs may
# stay in that state for several minutes, and this is considered ok.
ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v -E "active|premerge" | grep -B1 '"state":' > ${inactive_pgs_file} || true
# If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
@ -270,6 +273,7 @@ function check_pgs() {
echo "Very likely the cluster is rebalancing or recovering some PG's. Checking..."
# Check for PGs that are down. These are critical errors.
down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
if [[ ${#down_pgs[*]} -gt 0 ]]; then
# Some PGs could be down. This is really bad situation and test must fail.
@ -279,23 +283,32 @@ function check_pgs() {
exit 1
fi
non_peer_recover_pgs=(`cat ${inactive_pgs_file} | grep '"state":' | grep -v -E 'peer|recover' || true`)
if [[ ${#non_peer_recover_pgs[*]} -gt 0 ]]; then
# Check for PGs that are in some transient state due to rebalancing,
# peering or backfilling. If we see other states which are not in the
# following list of states, then we likely have a problem and need to
# exit.
transient_states='peer|recover|activating|creating|unknown'
non_transient_pgs=(`cat ${inactive_pgs_file} | grep '"state":' | grep -v -E "${transient_states}" || true`)
if [[ ${#non_transient_pgs[*]} -gt 0 ]]; then
# Some PGs could be inactive and not peering. Better we fail.
echo "We are unsure what's happening: we don't have down/stuck PGs,"
echo "but we have some inactive pgs that are not peering/recover: "
pg_list=(`sed -n '/recover\|peer/{s/.*//;x;d;};x;p;${x;p;}' ${inactive_pgs_file} | sed '/^$/d' | awk -F "\"" '/pgid/{print $4}'`)
echo "We don't have down/stuck PGs, but we have some inactive pgs that"
echo "are not in the list of allowed transient states: "
pg_list=(`sed -n '/peer\|recover\|activating\|creating\|unknown/{s/.*//;x;d;};x;p;${x;p;}' ${inactive_pgs_file} | sed '/^$/d' | awk -F "\"" '/pgid/{print $4}'`)
echo ${pg_list[*]}
echo ${non_transient_pgs[*]}
# Critical error. Fail/exit the script
exit 1
fi
peer_recover_pgs=(`cat ${inactive_pgs_file} | grep -B1 -E 'peer|recover' | awk -F "\"" '/pgid/{print $4}'`)
if [[ ${#peer_recover_pgs[*]} -gt 0 ]]; then
# Check and note which PGs are in a transient state. This script
# will allow these transient states for a period of time
# (time_between_retries * max_retries seconds).
transient_pgs=(`cat ${inactive_pgs_file} | grep -B1 -E "${transient_states}" | awk -F "\"" '/pgid/{print $4}'`)
if [[ ${#transient_pgs[*]} -gt 0 ]]; then
# Some PGs are not in an active state but peering and/or cluster is recovering
echo "Some PGs are peering and/or cluster is recovering: "
echo ${peer_recover_pgs[*]}
echo "This is normal but will wait a while to verify the PGs are not stuck in peering."
echo ${transient_pgs[*]}
echo "This is normal but will wait a while to verify the PGs are not stuck in a transient state."
# not critical, just wait
pgs_transitioning=true
fi

View File

@ -13,4 +13,5 @@ ceph-client:
- 0.1.10 Separate pool quotas from pg_num calculations
- 0.1.11 enhance logic to enable and disable the autoscaler
- 0.1.12 Disable autoscaling before pools are created
- 0.1.13 Fix ceph-client helm test
...