diff --git a/ceph-client/Chart.yaml b/ceph-client/Chart.yaml
index b369b93a3..45d584ec6 100644
--- a/ceph-client/Chart.yaml
+++ b/ceph-client/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph Client
 name: ceph-client
-version: 0.1.12
+version: 0.1.13
 home: https://github.com/ceph/ceph-client
 ...
diff --git a/ceph-client/templates/bin/_helm-tests.sh.tpl b/ceph-client/templates/bin/_helm-tests.sh.tpl
index 0906c8159..3abcf708b 100755
--- a/ceph-client/templates/bin/_helm-tests.sh.tpl
+++ b/ceph-client/templates/bin/_helm-tests.sh.tpl
@@ -261,7 +261,10 @@ function check_pgs() {
       # Not a critical error - yet
       pgs_transitioning=true
     else
-      ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v "active" | grep -B1 '"state":' > ${inactive_pgs_file} || true
+      # Examine the PGs that have non-active states. Consider those PGs that
+      # are in a "premerge" state to be similar to active. "premerge" PGs may
+      # stay in that state for several minutes, and this is considered ok.
+      ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v -E "active|premerge" | grep -B1 '"state":' > ${inactive_pgs_file} || true
 
       # If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
       inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
@@ -270,6 +273,7 @@ function check_pgs() {
 
         echo "Very likely the cluster is rebalancing or recovering some PG's. Checking..."
 
+        # Check for PGs that are down. These are critical errors.
         down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
         if [[ ${#down_pgs[*]} -gt 0 ]]; then
          # Some PGs could be down. This is really bad situation and test must fail.
@@ -279,23 +283,32 @@ function check_pgs() {
           exit 1
         fi
 
-        non_peer_recover_pgs=(`cat ${inactive_pgs_file} | grep '"state":' | grep -v -E 'peer|recover' || true`)
-        if [[ ${#non_peer_recover_pgs[*]} -gt 0 ]]; then
+        # Check for PGs that are in some transient state due to rebalancing,
+        # peering or backfilling. If we see other states which are not in the
+        # following list of states, then we likely have a problem and need to
+        # exit.
+        transient_states='peer|recover|activating|creating|unknown'
+        non_transient_pgs=(`cat ${inactive_pgs_file} | grep '"state":' | grep -v -E "${transient_states}" || true`)
+        if [[ ${#non_transient_pgs[*]} -gt 0 ]]; then
           # Some PGs could be inactive and not peering. Better we fail.
-          echo "We are unsure what's happening: we don't have down/stuck PGs,"
-          echo "but we have some inactive pgs that are not peering/recover: "
-          pg_list=(`sed -n '/recover\|peer/{s/.*//;x;d;};x;p;${x;p;}' ${inactive_pgs_file} | sed '/^$/d' | awk -F "\"" '/pgid/{print $4}'`)
+          echo "We don't have down/stuck PGs, but we have some inactive pgs that"
+          echo "are not in the list of allowed transient states: "
+          pg_list=(`sed -n '/peer\|recover\|activating\|creating\|unknown/{s/.*//;x;d;};x;p;${x;p;}' ${inactive_pgs_file} | sed '/^$/d' | awk -F "\"" '/pgid/{print $4}'`)
           echo ${pg_list[*]}
+          echo ${non_transient_pgs[*]}
           # Critical error. Fail/exit the script
           exit 1
         fi
 
-        peer_recover_pgs=(`cat ${inactive_pgs_file} | grep -B1 -E 'peer|recover' | awk -F "\"" '/pgid/{print $4}'`)
-        if [[ ${#peer_recover_pgs[*]} -gt 0 ]]; then
+        # Check and note which PGs are in a transient state. This script
+        # will allow these transient states for a period of time
+        # (time_between_retries * max_retries seconds).
+        transient_pgs=(`cat ${inactive_pgs_file} | grep -B1 -E "${transient_states}" | awk -F "\"" '/pgid/{print $4}'`)
+        if [[ ${#transient_pgs[*]} -gt 0 ]]; then
           # Some PGs are not in an active state but peering and/or cluster is recovering
           echo "Some PGs are peering and/or cluster is recovering: "
-          echo ${peer_recover_pgs[*]}
-          echo "This is normal but will wait a while to verify the PGs are not stuck in peering."
+          echo ${transient_pgs[*]}
+          echo "This is normal but will wait a while to verify the PGs are not stuck in a transient state."
           # not critical, just wait
           pgs_transitioning=true
         fi
diff --git a/releasenotes/notes/ceph-client.yaml b/releasenotes/notes/ceph-client.yaml
index e9246a21d..aa3e867a8 100644
--- a/releasenotes/notes/ceph-client.yaml
+++ b/releasenotes/notes/ceph-client.yaml
@@ -13,4 +13,5 @@ ceph-client:
   - 0.1.10 Separate pool quotas from pg_num calculations
   - 0.1.11 enhance logic to enable and disable the autoscaler
   - 0.1.12 Disable autoscaling before pools are created
+  - 0.1.13 Fix ceph-client helm test
 ...
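For illustration only (not part of the patch): a minimal sketch of how the patched pipeline classifies PG states, assuming the usual "pgid"/"state" line pairs that `ceph pg ls -f json-pretty` emits. The sample data and the /tmp paths below are hypothetical.

#!/bin/bash
# Hypothetical sample of the "pgid"/"state" pairs pulled from
# `ceph --cluster ${CLUSTER} pg ls -f json-pretty` (illustration only).
cat > /tmp/pg_sample <<'EOF'
    "pgid": "1.0",
    "state": "active+clean",
    "pgid": "1.1",
    "state": "active+clean+premerge",
    "pgid": "1.2",
    "state": "peering",
    "pgid": "1.3",
    "state": "activating",
EOF

# Same filter as the patched test: anything active or premerge counts as
# healthy; the rest lands in the inactive list alongside its pgid.
grep '"pgid":\|"state":' /tmp/pg_sample | grep -v -E "active|premerge" \
  | grep -B1 '"state":' > /tmp/inactive_pgs

transient_states='peer|recover|activating|creating|unknown'

# Any leftover state outside the transient list would fail the test;
# with this sample the list is empty.
grep '"state":' /tmp/inactive_pgs | grep -v -E "${transient_states}" || true

# Transient PGs (1.2 and 1.3 here) only make the test wait and retry.
grep -B1 -E "${transient_states}" /tmp/inactive_pgs | awk -F "\"" '/pgid/{print $4}'

Note that "activating" does not match the "active" pattern (no trailing "e"), which is why it must appear explicitly in the transient list rather than being dropped by the first filter.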