From c98ea9ca613cd7aef7f883db4bf129eb8cc0de65 Mon Sep 17 00:00:00 2001
From: Sergiy Markin <smarkin@mirantis.com>
Date: Wed, 11 Dec 2024 17:59:53 +0000
Subject: [PATCH] [ceph] Fix for ceph-osd pods restart

This PS updates the ceph-osd pod containers to make sure
that OSD pods do not get stuck at deletion.

It adds the missing lifecycle preStop action for the log-runner container.

Change-Id: I8d6853a457d3142c33ca6b5449351d9b05ffacda
---
 ceph-osd/Chart.yaml                                | 2 +-
 ceph-osd/templates/bin/_helm-tests.sh.tpl          | 9 +++++++--
 ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl | 2 ++
 ceph-osd/templates/bin/osd/_log-tail.sh.tpl        | 5 ++++-
 ceph-osd/templates/daemonset-osd.yaml              | 5 +++++
 releasenotes/notes/ceph-osd.yaml                   | 1 +
 6 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml
index 85da89020..0d52d074b 100644
--- a/ceph-osd/Chart.yaml
+++ b/ceph-osd/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph OSD
 name: ceph-osd
-version: 0.1.55
+version: 0.1.56
 home: https://github.com/ceph/ceph
 ...
diff --git a/ceph-osd/templates/bin/_helm-tests.sh.tpl b/ceph-osd/templates/bin/_helm-tests.sh.tpl
index cc21c9726..28ea4edc5 100644
--- a/ceph-osd/templates/bin/_helm-tests.sh.tpl
+++ b/ceph-osd/templates/bin/_helm-tests.sh.tpl
@@ -49,12 +49,18 @@ function check_osd_count() {
       fi
     done
     echo "Caution: noup flag is set. ${count} OSDs in up/new state. Required number of OSDs: ${MIN_OSDS}."
+    wait_for_degraded_objects
+    echo "There is  no degraded objects found"
+    ceph -s
     exit 0
   else
     if [ "${num_osd}" -eq 0 ]; then
       echo "There are no osds in the cluster"
     elif [ "${num_in_osds}" -ge "${MIN_OSDS}" ] && [ "${num_up_osds}" -ge "${MIN_OSDS}"  ]; then
       echo "Required number of OSDs (${MIN_OSDS}) are UP and IN status"
+      wait_for_degraded_objects
+      echo "There is  no degraded objects found"
+      ceph -s
       exit 0
     else
       echo "Required number of OSDs (${MIN_OSDS}) are NOT UP and IN status. Cluster shows OSD count=${num_osd}, UP=${num_up_osds}, IN=${num_in_osds}"
@@ -70,5 +76,4 @@ while true; do
   check_osd_count
   sleep 10
 done
-wait_for_degraded_objects
-ceph -s
+
diff --git a/ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl b/ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl
index 646a6bded..eed9dbb75 100644
--- a/ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl
+++ b/ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl
@@ -18,6 +18,8 @@ set -ex
 
 source /tmp/utils-resolveLocations.sh
 
+touch /tmp/ceph-log-runner.stop
+
 TAIL_PID="$(cat /tmp/ceph-log-runner.pid)"
 while kill -0 ${TAIL_PID} >/dev/null 2>&1;
 do
diff --git a/ceph-osd/templates/bin/osd/_log-tail.sh.tpl b/ceph-osd/templates/bin/osd/_log-tail.sh.tpl
index f8c4c8e10..541aa5fbf 100644
--- a/ceph-osd/templates/bin/osd/_log-tail.sh.tpl
+++ b/ceph-osd/templates/bin/osd/_log-tail.sh.tpl
@@ -27,7 +27,10 @@ function tail_file () {
     tail_pid=$!
     echo $tail_pid > /tmp/ceph-log-runner.pid
     wait $tail_pid
-    sleep 10
+    if [ -f /tmp/ceph-log-runner.stop ]; then
+      keep_running=false
+    fi
+    sleep 30
   done
 }
 
diff --git a/ceph-osd/templates/daemonset-osd.yaml b/ceph-osd/templates/daemonset-osd.yaml
index 41d6b7b07..565f00a79 100644
--- a/ceph-osd/templates/daemonset-osd.yaml
+++ b/ceph-osd/templates/daemonset-osd.yaml
@@ -317,6 +317,11 @@ spec:
               value: {{ .Values.logging.osd_id.timeout | quote }}
           command:
             - /tmp/log-tail.sh
+          lifecycle:
+            preStop:
+              exec:
+                command:
+                  - /tmp/log-runner-stop.sh
           volumeMounts:
             - name: pod-tmp
               mountPath: /tmp
diff --git a/releasenotes/notes/ceph-osd.yaml b/releasenotes/notes/ceph-osd.yaml
index ca681f9ea..165a9c522 100644
--- a/releasenotes/notes/ceph-osd.yaml
+++ b/releasenotes/notes/ceph-osd.yaml
@@ -56,4 +56,5 @@ ceph-osd:
   - 0.1.53 Update ceph-daemon to be able to use tini init system
   - 0.1.54 Remove use of tini for ceph-daemon
   - 0.1.55 Update ceph-osd pod containers to make sure OSD pods are properly terminated at restart
+  - 0.1.56 Add preStop lifecycle script to log-runner
 ...