[ceph-client] Fix a helm test issue and disable PG autoscaler

Currently the Ceph helm tests pass even when the deployed Ceph cluster
is unhealthy. This change expands the cluster status test logic to
pass when all PGs are active (allowing peering PGs time to become
active) and to fail if any PG remains inactive.

The PG autoscaler is currently causing deployments to produce
unhealthy Ceph clusters, so this change also disables it. It should
be re-enabled once those issues are resolved.

Change-Id: Iea1ff5006fc00e4570cf67c6af5ef6746a538058
Author: Stephen Taylor
Date:   2020-07-28 15:56:28 +00:00
parent  8633b93548
commit  84f1557566
3 changed files with 49 additions and 4 deletions
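
For reference, the heart of the new test logic can be read as the following standalone sketch (a simplified restatement of the first hunk below; it assumes only that the ceph CLI can reach the cluster and it reuses the same 60 x 3-second retry budget):

#!/bin/bash
# Sketch of the PG-state check: tolerate a cluster that is not HEALTH_OK
# as long as every PG is active, or is peering and becomes active in time.
check_pgs_active() {
  local retries=0
  # grep on the "state" field is Ceph-version agnostic, unlike a jq path
  while [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v "active") ]] && [[ $retries -lt 60 ]]; do
    # Any inactive PG that is not peering is an immediate failure
    if [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v -e "active" -e "peering") ]]; then
      echo "Failure, found inactive PGs that aren't peering"
      return 1
    fi
    sleep 3
    retries=$((retries+1))
  done
  # Peering PGs that never went active within ~3 minutes are also a failure
  if [[ $retries -ge 60 ]]; then
    echo "PGs appear to be stuck peering"
    return 1
  fi
}

In other words, anything inactive and not peering fails immediately, while anything still peering gets roughly three minutes to go active before the test gives up.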


@@ -24,7 +24,37 @@ function check_cluster_status() {
if [ "x${ceph_health_status}" == "xHEALTH_OK" ]; then
echo "Ceph status is HEALTH_OK"
else
echo "Ceph cluster status is NOT HEALTH_OK."
echo "Ceph cluster status is not HEALTH_OK, checking PG states"
retries=0
# If all PGs are active, pass
# This grep is just as robust as jq and is Ceph-version agnostic unlike jq
while [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v "active") ]] && [[ retries -lt 60 ]]; do
# If all inactive PGs are peering, wait for peering to complete
# Run 'ceph pg ls' again before failing in case PG states have changed
if [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v -e "active" -e "peering") ]]; then
# If inactive PGs aren't peering, fail
echo "Failure, found inactive PGs that aren't peering"
exit 1
fi
sleep 3
((retries=retries+1))
done
# If peering PGs haven't gone active after retries have expired, fail
if [[ retries -ge 60 ]]; then
echo "PGs appear to be stuck peering"
exit 1
fi
fi
}
function check_recovery_flags() {
  echo "### Start: Checking for flags that will prevent recovery"
  # Ensure there are no flags set that will prevent recovery of degraded PGs
  if [[ $(ceph osd stat | grep "norecover\|nobackfill\|norebalance") ]]; then
    ceph osd stat
    echo "Flags are set that prevent recovery of degraded PGs"
    exit 1
  fi
}
@@ -257,3 +287,4 @@ pool_validation
pool_failuredomain_validation
check_failure_domain_count_per_pool
check_cluster_status
check_recovery_flags
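
The new check_recovery_flags helper makes the test fail fast when cluster-wide flags would leave degraded PGs permanently unrecovered. If the test reports such flags, they can be inspected and cleared with the standard ceph CLI; purely as an illustration, not part of this change:

# Show cluster flags; norecover/nobackfill/norebalance block recovery of degraded PGs
ceph osd stat
# Clear whichever offending flag is reported
ceph osd unset norecover
ceph osd unset nobackfill
ceph osd unset norebalance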


@@ -46,6 +46,17 @@ function wait_for_inactive_pgs () {
fi
}
function check_recovery_flags () {
  echo "### Start: Checking for flags that will prevent recovery"
  # Ensure there are no flags set that will prevent recovery of degraded PGs
  if [[ $(ceph osd stat | grep "norecover\|nobackfill\|norebalance") ]]; then
    ceph osd stat
    echo "Flags are set that prevent recovery of degraded PGs"
    exit 1
  fi
}
function check_osd_count() {
  echo "#### Start: Checking OSD count ####"
  noup_flag=$(ceph osd stat | awk '/noup/ {print $2}')
@@ -119,10 +130,12 @@ function reweight_osds () {
done
}
function enable_autoscaling () {
function enable_or_disable_autoscaling () {
  if [[ "${ENABLE_AUTOSCALER}" == "true" ]]; then
    ceph mgr module enable pg_autoscaler
    ceph config set global osd_pool_default_pg_autoscale_mode on
  else
    ceph mgr module disable pg_autoscaler
  fi
}
@@ -232,7 +245,7 @@ reweight_osds
cluster_capacity=0
if [[ -z "$(ceph osd versions | grep ceph\ version | grep -v nautilus)" ]]; then
  cluster_capacity=$(ceph --cluster "${CLUSTER}" df | grep "TOTAL" | awk '{print $2 substr($3, 1, 1)}' | numfmt --from=iec)
  enable_autoscaling
  enable_or_disable_autoscaling
else
  cluster_capacity=$(ceph --cluster "${CLUSTER}" df | head -n3 | tail -n1 | awk '{print $1 substr($2, 1, 1)}' | numfmt --from=iec)
fi
@@ -253,3 +266,4 @@ ceph --cluster "${CLUSTER}" osd crush tunables {{ .Values.conf.pool.crush.tunables }}
{{- end }}
wait_for_inactive_pgs
check_recovery_flags
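
Because enable_or_disable_autoscaling only runs on Nautilus clusters and is now gated on ENABLE_AUTOSCALER, it can be worth confirming which way a given deployment actually ended up; a few standard, read-only checks (illustrative only):

# Is the pg_autoscaler mgr module enabled or disabled?
ceph mgr module ls | grep -i pg_autoscaler
# What default mode was set cluster-wide, and what did each pool inherit?
ceph config dump | grep pg_autoscale
ceph osd pool autoscale-status   # only meaningful while the module is enabled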


@@ -254,7 +254,7 @@ conf:
  features:
    mds: true
    mgr: true
    pg_autoscaler: true
    pg_autoscaler: false
  cluster_flags:
    # List of flags to set or unset separated by spaces
    set: ""