[ceph-client] Fix a helm test issue and disable PG autoscaler
Currently the Ceph helm tests pass even when the deployed Ceph cluster is unhealthy. This change expands the cluster status check so that it passes only when all PGs are active and fails if any PG is inactive. The PG autoscaler is currently causing deployments to produce unhealthy Ceph clusters, so this change also disables it; it should be re-enabled once those issues are resolved. Change-Id: Iea1ff5006fc00e4570cf67c6af5ef6746a538058
This commit is contained in:
parent
8633b93548
commit
84f1557566
@ -24,7 +24,37 @@ function check_cluster_status() {
|
||||
if [ "x${ceph_health_status}" == "xHEALTH_OK" ]; then
|
||||
echo "Ceph status is HEALTH_OK"
|
||||
else
|
||||
echo "Ceph cluster status is NOT HEALTH_OK."
|
||||
echo "Ceph cluster status is not HEALTH_OK, checking PG states"
|
||||
retries=0
|
||||
# If all PGs are active, pass
|
||||
# This grep is just as robust as jq and is Ceph-version agnostic unlike jq
|
||||
while [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v "active") ]] && [[ retries -lt 60 ]]; do
|
||||
# If all inactive PGs are peering, wait for peering to complete
|
||||
# Run 'ceph pg ls' again before failing in case PG states have changed
|
||||
if [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v -e "active" -e "peering") ]]; then
|
||||
# If inactive PGs aren't peering, fail
|
||||
echo "Failure, found inactive PGs that aren't peering"
|
||||
exit 1
|
||||
fi
|
||||
sleep 3
|
||||
((retries=retries+1))
|
||||
done
|
||||
# If peering PGs haven't gone active after retries have expired, fail
|
||||
if [[ retries -ge 60 ]]; then
|
||||
echo "PGs appear to be stuck peering"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
#######################################
# Fail the run if any OSD flag that blocks recovery of degraded PGs is set.
# Globals:   none
# Arguments: none
# Outputs:   progress banner to stdout; on failure, the 'ceph osd stat'
#            output that was inspected followed by an explanatory message
# Returns:   0 when none of norecover/nobackfill/norebalance is reported;
#            exits the shell with status 1 otherwise, or when the OSD
#            status cannot be queried
#######################################
function check_recovery_flags() {
  local osd_stat
  local -r blocking_flags='norecover|nobackfill|norebalance'

  echo "### Start: Checking for flags that will prevent recovery"

  # Query the cluster once so the status printed on failure is exactly the
  # status that was checked. A failed query must not be read as "no flags".
  if ! osd_stat=$(ceph osd stat); then
    echo "Unable to query OSD status"
    exit 1
  fi

  # Ensure there are no flags set that will prevent recovery of degraded PGs
  if [[ "${osd_stat}" =~ ${blocking_flags} ]]; then
    printf '%s\n' "${osd_stat}"
    echo "Flags are set that prevent recovery of degraded PGs"
    exit 1
  fi
}
|
||||
|
||||
@ -257,3 +287,4 @@ pool_validation
|
||||
pool_failuredomain_validation
|
||||
check_failure_domain_count_per_pool
|
||||
check_cluster_status
|
||||
check_recovery_flags
|
||||
|
@ -46,6 +46,17 @@ function wait_for_inactive_pgs () {
|
||||
fi
|
||||
}
|
||||
|
||||
#######################################
# Fail the run if any OSD flag that blocks recovery of degraded PGs is set.
# Globals:   none
# Arguments: none
# Outputs:   progress banner to stdout; on failure, the 'ceph osd stat'
#            output that was inspected followed by an explanatory message
# Returns:   0 when none of norecover/nobackfill/norebalance is reported;
#            exits the shell with status 1 otherwise, or when the OSD
#            status cannot be queried
#######################################
function check_recovery_flags () {
  local osd_stat
  local -r blocking_flags='norecover|nobackfill|norebalance'

  echo "### Start: Checking for flags that will prevent recovery"

  # Query the cluster once so the status printed on failure is exactly the
  # status that was checked. A failed query must not be read as "no flags".
  if ! osd_stat=$(ceph osd stat); then
    echo "Unable to query OSD status"
    exit 1
  fi

  # Ensure there are no flags set that will prevent recovery of degraded PGs
  if [[ "${osd_stat}" =~ ${blocking_flags} ]]; then
    printf '%s\n' "${osd_stat}"
    echo "Flags are set that prevent recovery of degraded PGs"
    exit 1
  fi
}
|
||||
|
||||
function check_osd_count() {
|
||||
echo "#### Start: Checking OSD count ####"
|
||||
noup_flag=$(ceph osd stat | awk '/noup/ {print $2}')
|
||||
@ -119,10 +130,12 @@ function reweight_osds () {
|
||||
done
|
||||
}
|
||||
|
||||
function enable_autoscaling () {
|
||||
#######################################
# Switch the Ceph PG autoscaler on or off according to ENABLE_AUTOSCALER.
# Globals:   ENABLE_AUTOSCALER (read) - the literal string "true" enables
#            the autoscaler; any other value disables it
# Arguments: none
# Outputs:   whatever the invoked ceph commands print
# Returns:   status of the last ceph command run
#######################################
function enable_or_disable_autoscaling () {
  if [[ "${ENABLE_AUTOSCALER}" == "true" ]]; then
    ceph mgr module enable pg_autoscaler
    ceph config set global osd_pool_default_pg_autoscale_mode on
  else
    ceph mgr module disable pg_autoscaler
    # The enable branch stores a global default of "on" for new pools; reset
    # it here so a value left by an earlier run with the autoscaler enabled
    # does not keep applying after the module has been switched off.
    ceph config set global osd_pool_default_pg_autoscale_mode off
  fi
}
|
||||
|
||||
@ -232,7 +245,7 @@ reweight_osds
|
||||
cluster_capacity=0
|
||||
if [[ -z "$(ceph osd versions | grep ceph\ version | grep -v nautilus)" ]]; then
|
||||
cluster_capacity=$(ceph --cluster "${CLUSTER}" df | grep "TOTAL" | awk '{print $2 substr($3, 1, 1)}' | numfmt --from=iec)
|
||||
enable_autoscaling
|
||||
enable_or_disable_autoscaling
|
||||
else
|
||||
cluster_capacity=$(ceph --cluster "${CLUSTER}" df | head -n3 | tail -n1 | awk '{print $1 substr($2, 1, 1)}' | numfmt --from=iec)
|
||||
fi
|
||||
@ -253,3 +266,4 @@ ceph --cluster "${CLUSTER}" osd crush tunables {{ .Values.conf.pool.crush.tunabl
|
||||
{{- end }}
|
||||
|
||||
wait_for_inactive_pgs
|
||||
check_recovery_flags
|
||||
|
@ -254,7 +254,7 @@ conf:
|
||||
features:
|
||||
mds: true
|
||||
mgr: true
|
||||
pg_autoscaler: true
|
||||
pg_autoscaler: false
|
||||
cluster_flags:
|
||||
# List of flags to set or unset separated by spaces
|
||||
set: ""
|
||||
|
Loading…
Reference in New Issue
Block a user