From 791b0de5ee2e6e7ff65ab08bf04f6fc8c2dac6f8 Mon Sep 17 00:00:00 2001 From: "Taylor, Stephen (st053q)" Date: Mon, 23 Nov 2020 14:49:40 -0700 Subject: [PATCH] [ceph-osd] Fix post-apply job failure related to fault tolerance A recent change to wait_for_pods() to allow for fault tolerance appears to be causing wait_for_pgs() to fail and exit the post-apply script prematurely in some cases. The existing wait_for_degraded_objects() logic won't pass until pods and PGs have recovered while the noout flag is set, so the pod and PG waits can simply be removed. Change-Id: I5fd7f422d710c18dee237c0ae97ae1a770606605 --- ceph-osd/Chart.yaml | 2 +- ceph-osd/templates/bin/_post-apply.sh.tpl | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml index d70f71015..1f8436c3e 100644 --- a/ceph-osd/Chart.yaml +++ b/ceph-osd/Chart.yaml @@ -15,6 +15,6 @@ apiVersion: v1 appVersion: v1.0.0 description: OpenStack-Helm Ceph OSD name: ceph-osd -version: 0.1.10 +version: 0.1.11 home: https://github.com/ceph/ceph ... diff --git a/ceph-osd/templates/bin/_post-apply.sh.tpl b/ceph-osd/templates/bin/_post-apply.sh.tpl index 4d2b19054..be9114bd2 100644 --- a/ceph-osd/templates/bin/_post-apply.sh.tpl +++ b/ceph-osd/templates/bin/_post-apply.sh.tpl @@ -148,9 +148,8 @@ function restart_by_rack() { # The pods will not be ready in first 60 seconds. Thus we can reduce # amount of queries to kubernetes. sleep 60 - wait_for_pods $CEPH_NAMESPACE - echo "waiting for inactive pgs after osds restarted from rack $rack" - wait_for_pgs + # Degraded objects won't recover with noout set unless pods come back and + # PGs become healthy, so simply wait for 0 degraded objects wait_for_degraded_objects ceph -s done