From c98ea9ca613cd7aef7f883db4bf129eb8cc0de65 Mon Sep 17 00:00:00 2001 From: Sergiy Markin Date: Wed, 11 Dec 2024 17:59:53 +0000 Subject: [PATCH] [ceph] Fix for ceph-osd pods restart This PS updates ceph-osd pod containers making sure that osd pods are not stuck at deletion. It adds the missing lifecycle preStop action for the log-runner container. Change-Id: I8d6853a457d3142c33ca6b5449351d9b05ffacda --- ceph-osd/Chart.yaml | 2 +- ceph-osd/templates/bin/_helm-tests.sh.tpl | 9 +++++++-- ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl | 2 ++ ceph-osd/templates/bin/osd/_log-tail.sh.tpl | 5 ++++- ceph-osd/templates/daemonset-osd.yaml | 5 +++++ releasenotes/notes/ceph-osd.yaml | 1 + 6 files changed, 20 insertions(+), 4 deletions(-) diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml index 85da89020..0d52d074b 100644 --- a/ceph-osd/Chart.yaml +++ b/ceph-osd/Chart.yaml @@ -15,6 +15,6 @@ apiVersion: v1 appVersion: v1.0.0 description: OpenStack-Helm Ceph OSD name: ceph-osd -version: 0.1.55 +version: 0.1.56 home: https://github.com/ceph/ceph ... diff --git a/ceph-osd/templates/bin/_helm-tests.sh.tpl b/ceph-osd/templates/bin/_helm-tests.sh.tpl index cc21c9726..28ea4edc5 100644 --- a/ceph-osd/templates/bin/_helm-tests.sh.tpl +++ b/ceph-osd/templates/bin/_helm-tests.sh.tpl @@ -49,12 +49,18 @@ function check_osd_count() { fi done echo "Caution: noup flag is set. ${count} OSDs in up/new state. Required number of OSDs: ${MIN_OSDS}." + wait_for_degraded_objects + echo "There is no degraded objects found" + ceph -s exit 0 else if [ "${num_osd}" -eq 0 ]; then echo "There are no osds in the cluster" elif [ "${num_in_osds}" -ge "${MIN_OSDS}" ] && [ "${num_up_osds}" -ge "${MIN_OSDS}" ]; then echo "Required number of OSDs (${MIN_OSDS}) are UP and IN status" + wait_for_degraded_objects + echo "There is no degraded objects found" + ceph -s exit 0 else echo "Required number of OSDs (${MIN_OSDS}) are NOT UP and IN status. 
Cluster shows OSD count=${num_osd}, UP=${num_up_osds}, IN=${num_in_osds}" @@ -70,5 +76,4 @@ while true; do check_osd_count sleep 10 done -wait_for_degraded_objects -ceph -s + diff --git a/ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl b/ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl index 646a6bded..eed9dbb75 100644 --- a/ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl +++ b/ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl @@ -18,6 +18,8 @@ set -ex source /tmp/utils-resolveLocations.sh +touch /tmp/ceph-log-runner.stop + TAIL_PID="$(cat /tmp/ceph-log-runner.pid)" while kill -0 ${TAIL_PID} >/dev/null 2>&1; do diff --git a/ceph-osd/templates/bin/osd/_log-tail.sh.tpl b/ceph-osd/templates/bin/osd/_log-tail.sh.tpl index f8c4c8e10..541aa5fbf 100644 --- a/ceph-osd/templates/bin/osd/_log-tail.sh.tpl +++ b/ceph-osd/templates/bin/osd/_log-tail.sh.tpl @@ -27,7 +27,10 @@ function tail_file () { tail_pid=$! echo $tail_pid > /tmp/ceph-log-runner.pid wait $tail_pid - sleep 10 + if [ -f /tmp/ceph-log-runner.stop ]; then + keep_running=false + fi + sleep 30 done } diff --git a/ceph-osd/templates/daemonset-osd.yaml b/ceph-osd/templates/daemonset-osd.yaml index 41d6b7b07..565f00a79 100644 --- a/ceph-osd/templates/daemonset-osd.yaml +++ b/ceph-osd/templates/daemonset-osd.yaml @@ -317,6 +317,11 @@ spec: value: {{ .Values.logging.osd_id.timeout | quote }} command: - /tmp/log-tail.sh + lifecycle: + preStop: + exec: + command: + - /tmp/log-runner-stop.sh volumeMounts: - name: pod-tmp mountPath: /tmp diff --git a/releasenotes/notes/ceph-osd.yaml b/releasenotes/notes/ceph-osd.yaml index ca681f9ea..165a9c522 100644 --- a/releasenotes/notes/ceph-osd.yaml +++ b/releasenotes/notes/ceph-osd.yaml @@ -56,4 +56,5 @@ ceph-osd: - 0.1.53 Update ceph-daemon to be able to use tini init system - 0.1.54 Remove use of tini for ceph-daemon - 0.1.55 Update ceph-osd pod containers to make sure OSD pods are properly terminated at restart + - 0.1.56 Add preStop lifecycle script to log-runner ...