[ceph-client] Fix a helm test issue and disable PG autoscaler

Currently, the Ceph helm tests pass even when the deployed Ceph cluster is unhealthy. This change expands the cluster status check so that it passes when all PGs are active and fails if any PG is inactive. The PG autoscaler is currently causing deployments to produce unhealthy Ceph clusters, so this change also disables it by default. It should be re-enabled once those issues are resolved. Change-Id: Iea1ff5006fc00e4570cf67c6af5ef6746a538058
2020-07-28 15:56:28 +00:00 · 2020-07-28 15:56:28 +00:00 · 84f1557566
commit 84f1557566
parent 8633b93548
3 changed files with 49 additions and 4 deletions
--- a/ceph-client/templates/bin/_helm-tests.sh.tpl
+++ b/ceph-client/templates/bin/_helm-tests.sh.tpl
@ -24,7 +24,37 @@ function check_cluster_status() {
  if [ "x${ceph_health_status}" == "xHEALTH_OK" ]; then
    echo "Ceph status is HEALTH_OK"
  else
-    echo "Ceph cluster status is NOT HEALTH_OK."
+    echo "Ceph cluster status is not HEALTH_OK, checking PG states"
+    retries=0
+    # If all PGs are active, pass
+    # This grep is just as robust as jq and is Ceph-version agnostic unlike jq
+    while [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v "active") ]] && [[ retries -lt 60 ]]; do
+      # If all inactive PGs are peering, wait for peering to complete
+      # Run 'ceph pg ls' again before failing in case PG states have changed
+      if [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v -e "active" -e "peering") ]]; then
+        # If inactive PGs aren't peering, fail
+        echo "Failure, found inactive PGs that aren't peering"
+        exit 1
+      fi
+      sleep 3
+      ((retries=retries+1))
+    done
+    # If peering PGs haven't gone active after retries have expired, fail
+    if [[ retries -ge 60 ]]; then
+      echo "PGs appear to be stuck peering"
+      exit 1
+    fi
+  fi
+}
+
+function check_recovery_flags() {
+  echo "### Start: Checking for flags that will prevent recovery"
+
+  # Ensure there are no flags set that will prevent recovery of degraded PGs
+  if [[ $(ceph osd stat | grep "norecover\|nobackfill\|norebalance") ]]; then
+    ceph osd stat
+    echo "Flags are set that prevent recovery of degraded PGs"
+    exit 1
  fi
 }

@ -257,3 +287,4 @@ pool_validation
 pool_failuredomain_validation
 check_failure_domain_count_per_pool
 check_cluster_status
+check_recovery_flags
--- a/ceph-client/templates/bin/pool/_init.sh.tpl
+++ b/ceph-client/templates/bin/pool/_init.sh.tpl
@ -46,6 +46,17 @@ function wait_for_inactive_pgs () {
  fi
 }

+function check_recovery_flags () {
+  echo "### Start: Checking for flags that will prevent recovery"
+
+  # Ensure there are no flags set that will prevent recovery of degraded PGs
+  if [[ $(ceph osd stat | grep "norecover\|nobackfill\|norebalance") ]]; then
+    ceph osd stat
+    echo "Flags are set that prevent recovery of degraded PGs"
+    exit 1
+  fi
+}
+
 function check_osd_count() {
  echo "#### Start: Checking OSD count ####"
  noup_flag=$(ceph osd stat | awk '/noup/ {print $2}')
@ -119,10 +130,12 @@ function reweight_osds () {
  done
 }

-function enable_autoscaling () {
+function enable_or_disable_autoscaling () {
  if [[ "${ENABLE_AUTOSCALER}" == "true" ]]; then
    ceph mgr module enable pg_autoscaler
    ceph config set global osd_pool_default_pg_autoscale_mode on
+  else
+    ceph mgr module disable pg_autoscaler
  fi
 }

@ -232,7 +245,7 @@ reweight_osds
 cluster_capacity=0
 if [[ -z "$(ceph osd versions | grep ceph\ version | grep -v nautilus)" ]]; then
  cluster_capacity=$(ceph --cluster "${CLUSTER}" df | grep "TOTAL" | awk '{print $2 substr($3, 1, 1)}' | numfmt --from=iec)
-  enable_autoscaling
+  enable_or_disable_autoscaling
 else
  cluster_capacity=$(ceph --cluster "${CLUSTER}" df | head -n3 | tail -n1 | awk '{print $1 substr($2, 1, 1)}' | numfmt --from=iec)
 fi
@ -253,3 +266,4 @@ ceph --cluster "${CLUSTER}" osd crush tunables {{ .Values.conf.pool.crush.tunabl
 {{- end }}

 wait_for_inactive_pgs
+check_recovery_flags
--- a/ceph-client/values.yaml
+++ b/ceph-client/values.yaml
@ -254,7 +254,7 @@ conf:
  features:
    mds: true
    mgr: true
-    pg_autoscaler: true
+    pg_autoscaler: false
    cluster_flags:
      # List of flags to set or unset separated by spaces
      set: ""