Merge "Make ceph-client helm test more PG specific"
commit e3142120cb
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph Client
 name: ceph-client
-version: 0.1.14
+version: 0.1.15
 home: https://github.com/ceph/ceph-client
 ...
@@ -246,6 +246,62 @@ function pool_failuredomain_validation() {
   done
 }
 
+function check_transient_pgs_file() {
+  current_time=$1
+  pg_failed_list=()
+
+  # Remove the lines NOT having the word "current" as these are the old
+  # PGs that are no longer in transition.
+  sed -i '/current/!d' ${transient_pgs_file}
+
+  # For all remaining lines (PGs currently inactive), check for PGs which
+  # are older than the limit.
+  IFS=$'\n' read -d '' -r -a lines < ${transient_pgs_file} || true
+  for pg_data in "${lines[@]}"; do
+    pg=$(echo ${pg_data} | awk '{print $1}')
+    pg_ts=$(echo ${pg_data} | awk '{print $2}')
+    if [[ $((${current_time} - ${pg_ts})) -gt ${pg_inactive_timeout} ]]; then
+      pg_failed_list+=("${pg}")
+    fi
+  done
+
+  # Remove the current designation for all PGs, as we no longer need it
+  # for this check.
+  sed -i 's/ current//g' ${transient_pgs_file}
+
+  cat ${transient_pgs_file}
+  if [[ ${#pg_failed_list[@]} -gt 0 ]]; then
+    echo "The following PGs have been in a transient state for longer than ${pg_inactive_timeout} seconds:"
+    echo ${pg_failed_list[*]}
+    exit 1
+  fi
+}
+
+function update_transient_pgs_file() {
+  pg=$1
+  current_ts=$2
+
+  pg_data=$(grep "${pg} " ${transient_pgs_file} || true)
+  if [[ "${pg_data}" == "" ]]; then
+    echo "${pg} ${current_ts} current" >> ${transient_pgs_file}
+  else
+    # Add the word "current" to the end of the line which has this PG
+    sed -i '/^'"${pg} "'/s/$/ current/' ${transient_pgs_file}
+  fi
+}
+
+function check_transient_pgs() {
+  local -n pg_array=$1
+
+  # Use a temporary transient PGs file to track the amount of time PGs
+  # are spending in a transitional state.
+  now=$(date +%s)
+  for pg in "${pg_array[@]}"; do
+    update_transient_pgs_file ${pg} ${now}
+  done
+  check_transient_pgs_file ${now}
+}
+
 function check_pgs() {
   pgs_transitioning=false
 
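
Taken together, the three new functions implement a small age-tracking protocol over a scratch file, one line per PG in the form "<pgid> <first-seen-timestamp>[ current]". A standalone sketch of that lifecycle across two polls may help; the PG id 2.7a is illustrative only, and GNU sed is assumed for sed -i:

#!/usr/bin/env bash
# Sketch only, not part of the chart: walk one PG through two polls.
transient_pgs_file=$(mktemp -p /tmp)
pg_inactive_timeout=30

# Poll 1: PG 2.7a is seen inactive for the first time, so it is recorded
# with its first-seen timestamp and tagged "current".
echo "2.7a $(date +%s) current" >> ${transient_pgs_file}

# After each check, the "current" tags are stripped.
sed -i 's/ current//g' ${transient_pgs_file}

# Poll 2: 2.7a is still inactive, so its existing line is re-tagged
# "current" while keeping the original timestamp; any PG that recovered
# stays untagged and is pruned by the '/current/!d' below.
sed -i '/^2.7a /s/$/ current/' ${transient_pgs_file}
sed -i '/current/!d' ${transient_pgs_file}

# The PG fails the test once (now - first-seen) exceeds pg_inactive_timeout.
cat ${transient_pgs_file}
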
@@ -260,6 +316,9 @@ function check_pgs() {
     echo ${stuck_pgs[*]}
     # Not a critical error - yet
     pgs_transitioning=true
+
+    # Check to see if any transitioning PG has been stuck for too long
+    check_transient_pgs stuck_pgs
   else
     # Examine the PGs that have non-active states. Consider those PGs that
     # are in a "premerge" state to be similar to active. "premerge" PGs may
@@ -268,10 +327,10 @@ function check_pgs() {
 
     # If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
     inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
-    echo "There is at least one inactive pg in the cluster: "
+    echo "This is the list of inactive pgs in the cluster: "
     echo ${inactive_pgs[*]}
 
-    echo "Very likely the cluster is rebalancing or recovering some PG's. Checking..."
+    echo "Checking to see if the cluster is rebalancing or recovering some PG's..."
 
     # Check for PGs that are down. These are critical errors.
    down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
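
The awk calls in this hunk recover PG ids from pretty-printed JSON by splitting lines on double quotes, so the id is the fourth quote-delimited field of any line mentioning pgid. A quick reproduction of the mechanics; the sample data is made up, while the real file holds the cluster's inactive-PG dump:

#!/usr/bin/env bash
# Illustrative stand-in for the inactive PGs file.
cat > /tmp/inactive_pgs_sample <<'EOF'
{
    "pgid": "2.7a",
    "state": "peering"
},
{
    "pgid": "3.1f",
    "state": "down"
}
EOF

# Same extraction the test uses: quote-split, take field 4 of pgid lines.
awk -F "\"" '/pgid/{print $4}' /tmp/inactive_pgs_sample
# prints: 2.7a then 3.1f

# The down-PG check narrows this to ids whose line precedes a 'down' state.
grep -B1 'down' /tmp/inactive_pgs_sample | awk -F "\"" '/pgid/{print $4}'
# prints: 3.1f
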
@@ -311,6 +370,9 @@ function check_pgs() {
       echo "This is normal but will wait a while to verify the PGs are not stuck in a transient state."
       # not critical, just wait
       pgs_transitioning=true
+
+      # Check to see if any transitioning PG has been stuck for too long
+      check_transient_pgs transient_pgs
     fi
   fi
 }
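
Note that both call sites pass the array by name (check_transient_pgs stuck_pgs, and above, check_transient_pgs transient_pgs) rather than expanding it: check_transient_pgs binds the argument with a bash nameref (local -n, bash 4.3+). A minimal illustration of that calling convention; count_pgs and the sample ids are hypothetical:

#!/usr/bin/env bash
function count_pgs() {
  local -n arr=$1    # nameref: arr aliases the caller's array
  echo "${#arr[@]} PGs: ${arr[*]}"
}

stuck_pgs=("2.7a" "3.1f")
count_pgs stuck_pgs  # prints: 2 PGs: 2.7a 3.1f
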
@@ -319,9 +381,11 @@ function pg_validation() {
   retries=0
   time_between_retries=3
   max_retries=60
+  pg_inactive_timeout=30
   pgs_transitioning=false
   stuck_pgs_file=$(mktemp -p /tmp)
   inactive_pgs_file=$(mktemp -p /tmp)
+  transient_pgs_file=$(mktemp -p /tmp)
 
   # Check this over a period of retries. Fail/stop if any critical errors found.
   while check_pgs && [[ "${pgs_transitioning}" == "true" ]] && [[ retries -lt ${max_retries} ]]; do
@@ -330,11 +394,11 @@ function pg_validation() {
     ((retries=retries+1))
   done
 
-  # If peering PGs haven't gone active after retries have expired, fail
+  # Check if transitioning PGs have gone active after retries have expired
   if [[ retries -ge ${max_retries} ]]; then
     ((timeout_sec=${time_between_retries}*${max_retries}))
-    echo "Some PGs have not become active or have been stuck after ${timeout_sec} seconds. Exiting..."
-    exit 1
+    echo "Some PGs have not become active after ${timeout_sec} seconds. Exiting..."
+    # This is ok, as the autoscaler might still be adjusting the PGs.
   fi
 }
 
@@ -15,4 +15,5 @@ ceph-client:
   - 0.1.12 Disable autoscaling before pools are created
   - 0.1.13 Fix ceph-client helm test
   - 0.1.14 Allow Ceph RBD pool job to leave failed pods
+  - 0.1.15 Make ceph-client helm test more PG specific
 ...