Fix ceph-client helm test

This patch resolves a helm test problem where the test failed if it found a PG in the "activating" state. The test could also encounter a number of other transient states, such as premerge or unknown, that would likewise cause it to fail. Note that the helm test still fails if these transient PG states persist for more than 3 minutes.

Change-Id: I071bcfedf7e4079e085c2f72d2fbab3adc0b027c
2021-03-19 21:42:34 +00:00 · 2021-03-19 21:42:34 +00:00 · 167b9eb1a8
commit 167b9eb1a8
parent 43226de6e3
3 changed files with 25 additions and 11 deletions
--- a/ceph-client/Chart.yaml
+++ b/ceph-client/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph Client
 name: ceph-client
-version: 0.1.12
+version: 0.1.13
 home: https://github.com/ceph/ceph-client
 ...
--- a/ceph-client/templates/bin/_helm-tests.sh.tpl
+++ b/ceph-client/templates/bin/_helm-tests.sh.tpl
@@ -261,7 +261,10 @@ function check_pgs() {
    # Not a critical error - yet
    pgs_transitioning=true
  else
-    ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v "active" | grep -B1 '"state":' > ${inactive_pgs_file} || true
+    # Examine the PGs that have non-active states. Consider those PGs that
+    # are in a "premerge" state to be similar to active. "premerge" PGs may
+    # stay in that state for several minutes, and this is considered ok.
+    ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v -E "active|premerge" | grep -B1 '"state":' > ${inactive_pgs_file} || true

    # If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
    inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
@@ -270,6 +273,7 @@ function check_pgs() {

    echo "Very likely the cluster is rebalancing or recovering some PG's. Checking..."

+    # Check for PGs that are down. These are critical errors.
    down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
    if [[ ${#down_pgs[*]} -gt 0 ]]; then
      # Some PGs could be down. This is really bad situation and test must fail.
@@ -279,23 +283,32 @@ function check_pgs() {
      exit 1
    fi

-    non_peer_recover_pgs=(`cat ${inactive_pgs_file} | grep '"state":' | grep -v -E 'peer|recover' || true`)
-    if [[ ${#non_peer_recover_pgs[*]} -gt 0 ]]; then
+    # Check for PGs that are in some transient state due to rebalancing,
+    # peering or backfilling. If we see other states which are not in the
+    # following list of states, then we likely have a problem and need to
+    # exit.
+    transient_states='peer|recover|activating|creating|unknown'
+    non_transient_pgs=(`cat ${inactive_pgs_file} | grep '"state":' | grep -v -E "${transient_states}" || true`)
+    if [[ ${#non_transient_pgs[*]} -gt 0 ]]; then
      # Some PGs could be inactive and not peering. Better we fail.
-      echo "We are unsure what's happening: we don't have down/stuck PGs,"
-      echo "but we have some inactive pgs that are not peering/recover: "
-      pg_list=(`sed -n '/recover\|peer/{s/.*//;x;d;};x;p;${x;p;}' ${inactive_pgs_file} | sed '/^$/d' | awk -F "\"" '/pgid/{print $4}'`)
+      echo "We don't have down/stuck PGs, but we have some inactive pgs that"
+      echo "are not in the list of allowed transient states: "
+      pg_list=(`sed -n '/peer\|recover\|activating\|creating\|unknown/{s/.*//;x;d;};x;p;${x;p;}' ${inactive_pgs_file} | sed '/^$/d' | awk -F "\"" '/pgid/{print $4}'`)
      echo ${pg_list[*]}
+      echo ${non_transient_pgs[*]}
      # Critical error. Fail/exit the script
      exit 1
    fi

-    peer_recover_pgs=(`cat ${inactive_pgs_file} | grep -B1 -E 'peer|recover' | awk -F "\"" '/pgid/{print $4}'`)
-    if [[ ${#peer_recover_pgs[*]} -gt 0 ]]; then
+    # Check and note which PGs are in a transient state. This script
+    # will allow these transient states for a period of time
+    # (time_between_retries * max_retries seconds).
+    transient_pgs=(`cat ${inactive_pgs_file} | grep -B1 -E "${transient_states}" | awk -F "\"" '/pgid/{print $4}'`)
+    if [[ ${#transient_pgs[*]} -gt 0 ]]; then
      # Some PGs are not in an active state but peering and/or cluster is recovering
      echo "Some PGs are peering and/or cluster is recovering: "
-      echo ${peer_recover_pgs[*]}
-      echo "This is normal but will wait a while to verify the PGs are not stuck in peering."
+      echo ${transient_pgs[*]}
+      echo "This is normal but will wait a while to verify the PGs are not stuck in a transient state."
      # not critical, just wait
      pgs_transitioning=true
    fi
--- a/releasenotes/notes/ceph-client.yaml
+++ b/releasenotes/notes/ceph-client.yaml
@@ -13,4 +13,5 @@ ceph-client:
  - 0.1.10 Separate pool quotas from pg_num calculations
  - 0.1.11 enhance logic to enable and disable the autoscaler
  - 0.1.12 Disable autoscaling before pools are created
+  - 0.1.13 Fix ceph-client helm test
 ...