[ceph-osd] Add a disruptive OSD restart to the post-apply job

Currently the ceph-osd post-apply job always restarts OSDs without disruption. This requires waiting for a healthy cluster state in between failure domain restarts, which isn't possible in some upgrade scenarios. In those scenarios where disruption is acceptable and a simultaneous restart of all OSDs is required, the disruptive_osd_restart value now provides this option. Change-Id: I64bfc30382e86c22b0f577d85fceef0d5c106d94
2022-03-30 15:01:12 -06:00 · 2022-03-30 15:01:12 -06:00 · 2fa26b2821
commit 2fa26b2821
parent 95a754a2c4
5 changed files with 27 additions and 9 deletions
--- a/ceph-osd/Chart.yaml
+++ b/ceph-osd/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph OSD
 name: ceph-osd
-version: 0.1.36
+version: 0.1.37
 home: https://github.com/ceph/ceph
 ...
--- a/ceph-osd/templates/bin/_post-apply.sh.tpl
+++ b/ceph-osd/templates/bin/_post-apply.sh.tpl
@@ -188,14 +188,24 @@ echo "Latest revision of the helm chart(s) is : $max_release"

 if [[ $max_release -gt 1  ]]; then
  if [[  $require_upgrade -gt 0 ]]; then
-    echo "waiting for inactive pgs and degraded objects before upgrade"
-    wait_for_pgs
-    wait_for_degraded_and_misplaced_objects
-    ceph -s
-    ceph osd "set" noout
-    echo "lets restart the osds rack by rack"
-    restart_by_rack
-    ceph osd "unset" noout
+    if [[ "$DISRUPTIVE_OSD_RESTART" == "true" ]]; then
+      echo "restarting all osds simultaneously"
+      kubectl -n $CEPH_NAMESPACE delete pod -l component=osd
+      sleep 60
+      echo "waiting for pgs to become active and for degraded objects to recover"
+      wait_for_pgs
+      wait_for_degraded_objects
+      ceph -s
+    else
+      echo "waiting for inactive pgs and degraded objects before upgrade"
+      wait_for_pgs
+      wait_for_degraded_and_misplaced_objects
+      ceph -s
+      ceph osd "set" noout
+      echo "lets restart the osds rack by rack"
+      restart_by_rack
+      ceph osd "unset" noout
+    fi
  fi

  #lets check all the ceph-osd daemonsets
--- a/ceph-osd/templates/job-post-apply.yaml
+++ b/ceph-osd/templates/job-post-apply.yaml
@@ -102,6 +102,8 @@ spec:
              value: {{ .Release.Name }}
            - name: REQUIRED_PERCENT_OF_OSDS
              value: {{ .Values.conf.ceph.target.required_percent_of_osds | ceil | quote }}
+            - name: DISRUPTIVE_OSD_RESTART
+              value: {{ .Values.conf.storage.disruptive_osd_restart | quote }}
          command:
            - /tmp/post-apply.sh
          volumeMounts:
--- a/ceph-osd/values.yaml
+++ b/ceph-osd/values.yaml
@@ -288,6 +288,11 @@ conf:
    #     type: directory
    #     location: /var/lib/openstack-helm/ceph/osd/journal-one

+    # The post-apply job will restart OSDs without disruption by default. Set
+    # this value to "true" to restart all OSDs at once. This will accomplish
+    # OSD restarts more quickly with disruption.
+    disruptive_osd_restart: "false"
+
 # NOTE(portdirect): for heterogeneous clusters the overrides section can be used to define
 # OSD pods that will be deployed upon specifc nodes.
 # overrides:
--- a/releasenotes/notes/ceph-osd.yaml
+++ b/releasenotes/notes/ceph-osd.yaml
@@ -37,4 +37,5 @@ ceph-osd:
  - 0.1.34 Remove wait for misplaced objects during OSD restarts
  - 0.1.35 Consolidate mon_endpoints discovery
  - 0.1.36 Add OSD device location pre-check
+  - 0.1.37 Add a disruptive OSD restart to the post-apply job
 ...