From 791b0de5ee2e6e7ff65ab08bf04f6fc8c2dac6f8 Mon Sep 17 00:00:00 2001
From: "Taylor, Stephen (st053q)" <st053q@att.com>
Date: Mon, 23 Nov 2020 14:49:40 -0700
Subject: [PATCH] [ceph-osd] Fix post-apply job failure related to fault
 tolerance

A recent change to wait_for_pods() to allow for fault tolerance
appears to be causing wait_for_pgs() to fail and exit the
post-apply script prematurely in some cases. The existing
wait_for_degraded_objects() logic won't pass until pods and PGs
have recovered while the noout flag is set, so the pod and PG
waits can simply be removed.

Change-Id: I5fd7f422d710c18dee237c0ae97ae1a770606605
---
 ceph-osd/Chart.yaml                       | 2 +-
 ceph-osd/templates/bin/_post-apply.sh.tpl | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml
index d70f71015..1f8436c3e 100644
--- a/ceph-osd/Chart.yaml
+++ b/ceph-osd/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph OSD
 name: ceph-osd
-version: 0.1.10
+version: 0.1.11
 home: https://github.com/ceph/ceph
 ...
diff --git a/ceph-osd/templates/bin/_post-apply.sh.tpl b/ceph-osd/templates/bin/_post-apply.sh.tpl
index 4d2b19054..be9114bd2 100644
--- a/ceph-osd/templates/bin/_post-apply.sh.tpl
+++ b/ceph-osd/templates/bin/_post-apply.sh.tpl
@@ -148,9 +148,8 @@ function restart_by_rack() {
      # The pods will not be ready in first 60 seconds. Thus we can reduce
      # amount of queries to kubernetes.
      sleep 60
-     wait_for_pods $CEPH_NAMESPACE
-     echo "waiting for inactive pgs after osds restarted from rack $rack"
-     wait_for_pgs
+     # Degraded objects won't recover with noout set unless pods come back and
+     # PGs become healthy, so simply wait for 0 degraded objects
      wait_for_degraded_objects
      ceph -s
   done