From cb73c61b4ee2f015b99b9a8809a822475f2f5900 Mon Sep 17 00:00:00 2001
From: Stephen Taylor <stephen.taylor.1@att.com>
Date: Tue, 23 Nov 2021 11:01:46 -0700
Subject: [PATCH] [ceph-osd] Remove wait for misplaced objects during OSD
 restarts

The wait for misplaced objects during the ceph-osd post-apply job
was added to prevent I/O disruption in the case where misplaced
objects cause multiple replicas in common failure domains. This
concern is only valid before OSD restarts begin because OSD
failures during the restart process won't cause replicas that
violate replication rules to appear elsewhere.

This change keeps the wait for misplaced objects prior to beginning
OSD restarts and removes it during those restarts. During OSD
restarts, the job now waits only for degraded objects to be recovered
before proceeding to the next failure domain.

Change-Id: Ic82c67b43089c7a2b45995d1fd9c285d5c0e7cbc
---
 ceph-osd/Chart.yaml                       |  2 +-
 ceph-osd/templates/bin/_post-apply.sh.tpl | 17 ++++++++++++++---
 releasenotes/notes/ceph-osd.yaml          |  1 +
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml
index 2a71f42d5..263248fa4 100644
--- a/ceph-osd/Chart.yaml
+++ b/ceph-osd/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph OSD
 name: ceph-osd
-version: 0.1.33
+version: 0.1.34
 home: https://github.com/ceph/ceph
 ...
diff --git a/ceph-osd/templates/bin/_post-apply.sh.tpl b/ceph-osd/templates/bin/_post-apply.sh.tpl
index ac71cbc66..59dd7f8e0 100644
--- a/ceph-osd/templates/bin/_post-apply.sh.tpl
+++ b/ceph-osd/templates/bin/_post-apply.sh.tpl
@@ -115,10 +115,21 @@ function wait_for_pgs () {
   done
 }
 
+# Blocks while the "ceph -s" output for ${CLUSTER} still mentions 'degraded'.
+function wait_for_degraded_objects () {
+  echo "#### Start: Checking for degraded objects ####"
+
+  # Re-check every 3 seconds, printing the cluster status after each wait
+  while [[ -n "$(ceph --cluster "${CLUSTER}" -s | grep 'degraded')" ]]; do
+    sleep 3
+    ceph -s
+  done
+}
+
 function wait_for_degraded_and_misplaced_objects () {
   echo "#### Start: Checking for degraded and misplaced objects ####"
 
-  # Loop until no degraded objects
+  # Loop until no degraded or misplaced objects
     while [[ ! -z "`ceph --cluster ${CLUSTER} -s | grep 'degraded\|misplaced'`" ]]
     do
       sleep 3
@@ -150,7 +161,7 @@ function restart_by_rack() {
      sleep 60
      # Degraded objects won't recover with noout set unless pods come back and
      # PGs become healthy, so simply wait for 0 degraded objects
-     wait_for_degraded_and_misplaced_objects
+     wait_for_degraded_objects
      ceph -s
   done
 }
@@ -177,7 +188,7 @@ echo "Latest revision of the helm chart(s) is : $max_release"
 
 if [[ $max_release -gt 1  ]]; then
   if [[  $require_upgrade -gt 0 ]]; then
-    echo "waiting for inactive pgs and degraded obejcts before upgrade"
+    echo "waiting for inactive pgs and degraded objects before upgrade"
     wait_for_pgs
     wait_for_degraded_and_misplaced_objects
     ceph -s
diff --git a/releasenotes/notes/ceph-osd.yaml b/releasenotes/notes/ceph-osd.yaml
index 36048dbd6..f93463d6d 100644
--- a/releasenotes/notes/ceph-osd.yaml
+++ b/releasenotes/notes/ceph-osd.yaml
@@ -34,4 +34,5 @@ ceph-osd:
   - 0.1.31 Helm 3 - Fix Job labels
   - 0.1.32 Update htk requirements
   - 0.1.33 Update log-runner container for MAC
+  - 0.1.34 Remove wait for misplaced objects during OSD restarts
 ...