[ceph-osd] Add a check for misplaced objects to the post-apply job

OSD failures during an update can cause degraded and misplaced objects. The post-apply job restarts OSDs in failure domain batches in order to accomplish the restarts efficiently. There is already a wait for degraded objects to ensure that OSDs are not restarted on degraded PGs, but misplaced objects could mean that multiple object replicas exist in the same failure domain, so the job should wait for those to recover as well before restarting OSDs in order to avoid potential disruption under these failure conditions.

Change-Id: I39606e388a9a1d3a4e9c547de56aac4fc5606ea2
2020-11-30 10:17:40 -07:00 · 2020-11-30 10:17:40 -07:00 · e37d1fc2ab
commit e37d1fc2ab
parent 3205c8b778
2 changed files with 6 additions and 6 deletions
--- a/ceph-osd/Chart.yaml
+++ b/ceph-osd/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph OSD
 name: ceph-osd
-version: 0.1.11
+version: 0.1.12
 home: https://github.com/ceph/ceph
 ...
--- a/ceph-osd/templates/bin/_post-apply.sh.tpl
+++ b/ceph-osd/templates/bin/_post-apply.sh.tpl
@@ -115,11 +115,11 @@ function wait_for_pgs () {
  done
 }

-function wait_for_degraded_objects () {
-  echo "#### Start: Checking for degraded objects ####"
+function wait_for_degraded_and_misplaced_objects () {
+  echo "#### Start: Checking for degraded and misplaced objects ####"

  # Loop until no degraded objects
-    while [[ ! -z "`ceph --cluster ${CLUSTER} -s | grep degraded`" ]]
+    while [[ ! -z "`ceph --cluster ${CLUSTER} -s | grep 'degraded\|misplaced'`" ]]
    do
      sleep 3
      ceph -s
@@ -150,7 +150,7 @@ function restart_by_rack() {
     sleep 60
     # Degraded objects won't recover with noout set unless pods come back and
     # PGs become healthy, so simply wait for 0 degraded objects
-     wait_for_degraded_objects
+     wait_for_degraded_and_misplaced_objects
     ceph -s
  done
 }
@@ -179,7 +179,7 @@ if [[ $max_release -gt 1  ]]; then
  if [[  $require_upgrade -gt 0 ]]; then
    echo "waiting for inactive pgs and degraded obejcts before upgrade"
    wait_for_pgs
-    wait_for_degraded_objects
+    wait_for_degraded_and_misplaced_objects
    ceph -s
    ceph osd "set" noout
    echo "lets restart the osds rack by rack"