From 76fb2562c60e829109d176800f021d64de5c5529 Mon Sep 17 00:00:00 2001
From: Stephen Taylor
Date: Mon, 4 Apr 2022 13:35:49 -0600
Subject: [PATCH] [ceph-osd] Allow for unconditional OSD restart

This change allows OSDs to be restarted unconditionally by the
ceph-osd chart. This can be useful in upgrade scenarios where
ceph-osd pods are unhealthy during the upgrade.

Change-Id: I6de98db2b4eb1d76411e1dbffa65c263de3aecee
---
 ceph-osd/Chart.yaml                       |  2 +-
 ceph-osd/templates/bin/_post-apply.sh.tpl | 54 +++++++++++++----------
 ceph-osd/templates/job-post-apply.yaml    |  2 +
 ceph-osd/values.yaml                      |  5 +++
 releasenotes/notes/ceph-osd.yaml          |  1 +
 5 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml
index e50427f68..be0c75bc7 100644
--- a/ceph-osd/Chart.yaml
+++ b/ceph-osd/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph OSD
 name: ceph-osd
-version: 0.1.38
+version: 0.1.39
 home: https://github.com/ceph/ceph
 ...
diff --git a/ceph-osd/templates/bin/_post-apply.sh.tpl b/ceph-osd/templates/bin/_post-apply.sh.tpl
index c8a50202d..74229676c 100644
--- a/ceph-osd/templates/bin/_post-apply.sh.tpl
+++ b/ceph-osd/templates/bin/_post-apply.sh.tpl
@@ -188,31 +188,37 @@ done
 
 echo "Latest revision of the helm chart(s) is : $max_release"
 
-if [[ $max_release -gt 1 ]]; then
-  if [[ $require_upgrade -gt 0 ]]; then
-    if [[ "$DISRUPTIVE_OSD_RESTART" == "true" ]]; then
-      echo "restarting all osds simultaneously"
-      kubectl -n $CEPH_NAMESPACE delete pod -l component=osd
-      sleep 60
-      echo "waiting for pgs to become active and for degraded objects to recover"
-      wait_for_pgs
-      wait_for_degraded_objects
-      ceph -s
-    else
-      echo "waiting for inactive pgs and degraded objects before upgrade"
-      wait_for_pgs
-      wait_for_degraded_and_misplaced_objects
-      ceph -s
-      ceph osd "set" noout
-      echo "lets restart the osds rack by rack"
-      restart_by_rack
-      ceph osd "unset" noout
+# If flags are set that will prevent recovery, don't restart OSDs
+ceph -s | grep "noup\|noin\|nobackfill\|norebalance\|norecover" > /dev/null
+if [[ $? -ne 0 ]]; then
+  if [[ "$UNCONDITIONAL_OSD_RESTART" == "true" ]] || [[ $max_release -gt 1 ]]; then
+    if [[ "$UNCONDITIONAL_OSD_RESTART" == "true" ]] || [[ $require_upgrade -gt 0 ]]; then
+      if [[ "$DISRUPTIVE_OSD_RESTART" == "true" ]]; then
+        echo "restarting all osds simultaneously"
+        kubectl -n $CEPH_NAMESPACE delete pod -l component=osd
+        sleep 60
+        echo "waiting for pgs to become active and for degraded objects to recover"
+        wait_for_pgs
+        wait_for_degraded_objects
+        ceph -s
+      else
+        echo "waiting for inactive pgs and degraded objects before upgrade"
+        wait_for_pgs
+        wait_for_degraded_and_misplaced_objects
+        ceph -s
+        ceph osd "set" noout
+        echo "lets restart the osds rack by rack"
+        restart_by_rack
+        ceph osd "unset" noout
+      fi
     fi
-  fi
-  #lets check all the ceph-osd daemonsets
-  echo "checking DS"
-  check_ds
+    #lets check all the ceph-osd daemonsets
+    echo "checking DS"
+    check_ds
+  else
+    echo "No revisions found for upgrade"
+  fi
 else
-  echo "No revisions found for upgrade"
+  echo "Skipping OSD restarts because flags are set that would prevent recovery"
 fi
 
diff --git a/ceph-osd/templates/job-post-apply.yaml b/ceph-osd/templates/job-post-apply.yaml
index 6e9a34707..393769d95 100644
--- a/ceph-osd/templates/job-post-apply.yaml
+++ b/ceph-osd/templates/job-post-apply.yaml
@@ -104,6 +104,8 @@ spec:
               value: {{ .Values.conf.ceph.target.required_percent_of_osds | ceil | quote }}
             - name: DISRUPTIVE_OSD_RESTART
               value: {{ .Values.conf.storage.disruptive_osd_restart | quote }}
+            - name: UNCONDITIONAL_OSD_RESTART
+              value: {{ .Values.conf.storage.unconditional_osd_restart | quote }}
           command:
             - /tmp/post-apply.sh
           volumeMounts:
diff --git a/ceph-osd/values.yaml b/ceph-osd/values.yaml
index 09c41e985..ad87e2a15 100644
--- a/ceph-osd/values.yaml
+++ b/ceph-osd/values.yaml
@@ -293,6 +293,11 @@ conf:
   # OSD restarts more quickly with disruption.
   disruptive_osd_restart: "false"
 
+  # The post-apply job will try to determine if OSDs need to be restarted and
+  # only restart them if necessary. Set this value to "true" to restart OSDs
+  # unconditionally.
+  unconditional_osd_restart: "false"
+
   # NOTE(portdirect): for heterogeneous clusters the overrides section can be used to define
   # OSD pods that will be deployed upon specifc nodes.
   # overrides:
diff --git a/releasenotes/notes/ceph-osd.yaml b/releasenotes/notes/ceph-osd.yaml
index dd319eafb..a66f6e597 100644
--- a/releasenotes/notes/ceph-osd.yaml
+++ b/releasenotes/notes/ceph-osd.yaml
@@ -39,4 +39,5 @@ ceph-osd:
     - 0.1.36 Add OSD device location pre-check
     - 0.1.37 Add a disruptive OSD restart to the post-apply job
     - 0.1.38 Skip pod wait in post-apply job when disruptive
+    - 0.1.39 Allow for unconditional OSD restart
 ...
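
For operators picking up this change, a minimal usage sketch follows. It
assumes a Helm release named "ceph-osd" in a "ceph" namespace deployed from a
local chart checkout; the release name, namespace, and chart path are
illustrative, not part of the patch.

    # Opt in to unconditional OSD restarts on the next post-apply run.
    helm upgrade ceph-osd ./ceph-osd \
      --namespace ceph \
      --reuse-values \
      --set conf.storage.unconditional_osd_restart="true"

Even with the override set, the post-apply script above still skips restarts
while recovery-blocking cluster flags are present. The same guard can be run
by hand before upgrading, and any stale flags cleared:

    # Mirrors the script's guard: any output here means restarts would be skipped.
    ceph -s | grep "noup\|noin\|nobackfill\|norebalance\|norecover"

    # Clear a flag that is no longer needed, for example:
    ceph osd unset norebalance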