From 5a9c8d41b7e3d42ca58546d72454bf050ea476ee Mon Sep 17 00:00:00 2001
From: Renis
Date: Fri, 4 May 2018 11:36:39 -0700
Subject: [PATCH] Docs: Update ceph documentation

- Add section for Ceph troubleshooting
- Rearrange Testing section to include Ceph

Co-Authored-By: portdirect
Change-Id: Ib04e9b59fea2557cf6cad177dfcc76390c161e06
Signed-off-by: Pete Birley
---
 doc/source/index.rst                          |   2 +-
 doc/source/testing/ceph-resiliency/README.rst |  25 +++
 .../testing/ceph-resiliency/disk-failure.rst  | 171 ++++++++++++++++++
 .../testing/ceph-resiliency/host-failure.rst  |  98 ++++++++++
 doc/source/testing/ceph-resiliency/index.rst  |  12 ++
 .../ceph-resiliency/monitor-failure.rst       | 125 +++++++++++++
 .../testing/ceph-resiliency/osd-failure.rst   | 107 +++++++++++
 .../{testing.rst => testing/helm-tests.rst}   |   8 +-
 doc/source/testing/index.rst                  |   9 +
 doc/source/troubleshooting/ceph.rst           |  59 ++++++
 doc/source/troubleshooting/index.rst          |   1 +
 11 files changed, 610 insertions(+), 7 deletions(-)
 create mode 100644 doc/source/testing/ceph-resiliency/README.rst
 create mode 100644 doc/source/testing/ceph-resiliency/disk-failure.rst
 create mode 100644 doc/source/testing/ceph-resiliency/host-failure.rst
 create mode 100644 doc/source/testing/ceph-resiliency/index.rst
 create mode 100644 doc/source/testing/ceph-resiliency/monitor-failure.rst
 create mode 100644 doc/source/testing/ceph-resiliency/osd-failure.rst
 rename doc/source/{testing.rst => testing/helm-tests.rst} (98%)
 create mode 100644 doc/source/testing/index.rst
 create mode 100644 doc/source/troubleshooting/ceph.rst

diff --git a/doc/source/index.rst b/doc/source/index.rst
index 16d8411094..b27cfb268b 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -17,7 +17,7 @@ Contents:
    install/index
    readme
    specs/index
-   testing
+   testing/index
    troubleshooting/index
 
 Indices and Tables
diff --git a/doc/source/testing/ceph-resiliency/README.rst b/doc/source/testing/ceph-resiliency/README.rst
new file mode 100644
index 0000000000..ab0a931285
--- /dev/null
+++ b/doc/source/testing/ceph-resiliency/README.rst
@@ -0,0 +1,25 @@
+========================================
+Resiliency Tests for OpenStack-Helm/Ceph
+========================================
+
+Mission
+=======
+
+The goal of our resiliency tests for `OpenStack-Helm/Ceph
+`_ is to
+demonstrate the symptoms of software and hardware failures and provide solutions.
+
+Caveats:
+  - Our focus is on resiliency under various failure scenarios, not on
+    performance or stress testing.
+
+Software Failure
+================
+* `Monitor failure <./monitor-failure.html>`_
+* `OSD failure <./osd-failure.html>`_
+
+Hardware Failure
+================
+* `Disk failure <./disk-failure.html>`_
+* `Host failure <./host-failure.html>`_
+
diff --git a/doc/source/testing/ceph-resiliency/disk-failure.rst b/doc/source/testing/ceph-resiliency/disk-failure.rst
new file mode 100644
index 0000000000..dbb7524ad7
--- /dev/null
+++ b/doc/source/testing/ceph-resiliency/disk-failure.rst
@@ -0,0 +1,171 @@
+============
+Disk Failure
+============
+
+Test Environment
+================
+
+- Cluster size: 4 host machines
+- Number of disks: 24 (= 6 disks per host * 4 hosts)
+- Kubernetes version: 1.10.5
+- Ceph version: 12.2.3
+- OpenStack-Helm commit: 25e50a34c66d5db7604746f4d2e12acbdd6c1459
+
+Case: A disk fails
+==================
+
+Symptom:
+--------
+
+This is to test a scenario where a disk fails. We monitor the Ceph status
+and notice that one OSD (osd.2) on voyager4, which uses ``/dev/sdh`` as its
+backing device, is down.
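+
+If it is not already clear which host and device back a failed OSD, the
+mapping can usually be confirmed from the monitor pod before proceeding (a
+sketch; osd.2 is the failed OSD in this example):
+
+.. code-block:: console
+
+  (mon-pod):/# ceph osd find 2
+  (mon-pod):/# ceph osd metadata 2
+
+``ceph osd find`` reports the host holding the OSD, and ``ceph osd metadata``
+includes details such as the hostname and backing device. The cluster status
+below reflects the same failure.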
+
+.. code-block:: console
+
+  (mon-pod):/# ceph -s
+    cluster:
+      id:     9d4d8c61-cf87-4129-9cef-8fbf301210ad
+      health: HEALTH_WARN
+              too few PGs per OSD (23 < min 30)
+              mon voyager1 is low on available space
+
+    services:
+      mon: 3 daemons, quorum voyager1,voyager2,voyager3
+      mgr: voyager1(active), standbys: voyager3
+      mds: cephfs-1/1/1 up {0=mds-ceph-mds-65bb45dffc-cslr6=up:active}, 1 up:standby
+      osd: 24 osds: 23 up, 23 in
+      rgw: 2 daemons active
+
+    data:
+      pools:   18 pools, 182 pgs
+      objects: 240 objects, 3359 bytes
+      usage:   2548 MB used, 42814 GB / 42816 GB avail
+      pgs:     182 active+clean
+
+.. code-block:: console
+
+  (mon-pod):/# ceph osd tree
+  ID CLASS WEIGHT   TYPE NAME          STATUS REWEIGHT PRI-AFF
+  -1       43.67981 root default
+  -9       10.91995     host voyager1
+   5   hdd  1.81999         osd.5          up  1.00000 1.00000
+   6   hdd  1.81999         osd.6          up  1.00000 1.00000
+  10   hdd  1.81999         osd.10         up  1.00000 1.00000
+  17   hdd  1.81999         osd.17         up  1.00000 1.00000
+  19   hdd  1.81999         osd.19         up  1.00000 1.00000
+  21   hdd  1.81999         osd.21         up  1.00000 1.00000
+  -3       10.91995     host voyager2
+   1   hdd  1.81999         osd.1          up  1.00000 1.00000
+   4   hdd  1.81999         osd.4          up  1.00000 1.00000
+  11   hdd  1.81999         osd.11         up  1.00000 1.00000
+  13   hdd  1.81999         osd.13         up  1.00000 1.00000
+  16   hdd  1.81999         osd.16         up  1.00000 1.00000
+  18   hdd  1.81999         osd.18         up  1.00000 1.00000
+  -2       10.91995     host voyager3
+   0   hdd  1.81999         osd.0          up  1.00000 1.00000
+   3   hdd  1.81999         osd.3          up  1.00000 1.00000
+  12   hdd  1.81999         osd.12         up  1.00000 1.00000
+  20   hdd  1.81999         osd.20         up  1.00000 1.00000
+  22   hdd  1.81999         osd.22         up  1.00000 1.00000
+  23   hdd  1.81999         osd.23         up  1.00000 1.00000
+  -4       10.91995     host voyager4
+   2   hdd  1.81999         osd.2        down        0 1.00000
+   7   hdd  1.81999         osd.7          up  1.00000 1.00000
+   8   hdd  1.81999         osd.8          up  1.00000 1.00000
+   9   hdd  1.81999         osd.9          up  1.00000 1.00000
+  14   hdd  1.81999         osd.14         up  1.00000 1.00000
+  15   hdd  1.81999         osd.15         up  1.00000 1.00000
+
+
+Solution:
+---------
+
+To replace the failed OSD, execute the following procedure:
+
+1. From the Kubernetes cluster, remove the failed OSD pod, which is running on ``voyager4``:
+
+.. code-block:: console
+
+  $ kubectl label nodes --all ceph_maintenance_window=inactive
+  $ kubectl label nodes voyager4 --overwrite ceph_maintenance_window=active
+  $ kubectl patch -n ceph ds ceph-osd-default-64779b8c -p='{"spec":{"template":{"spec":{"nodeSelector":{"ceph-osd":"enabled","ceph_maintenance_window":"inactive"}}}}}'
+
+2. To find the daemonset associated with the failed OSD (patched in the previous step), check the following:
+
+.. code-block:: console
+
+  (voyager4)$ ps -ef|grep /usr/bin/ceph-osd
+  (voyager1)$ kubectl get ds -n ceph
+  (voyager1)$ kubectl get ds -n ceph -o yaml
+
+3. Remove the failed OSD (OSD ID = 2 in this example) from the Ceph cluster:
+
+.. code-block:: console
+
+  (mon-pod):/# ceph osd lost 2
+  (mon-pod):/# ceph osd crush remove osd.2
+  (mon-pod):/# ceph auth del osd.2
+  (mon-pod):/# ceph osd rm 2
+
+4. Verify that Ceph is healthy with the lost OSD removed (i.e., a total of 23 OSDs):
+
+.. code-block:: console
+
+  (mon-pod):/# ceph -s
+    cluster:
+      id:     9d4d8c61-cf87-4129-9cef-8fbf301210ad
+      health: HEALTH_WARN
+              too few PGs per OSD (23 < min 30)
+              mon voyager1 is low on available space
+
+    services:
+      mon: 3 daemons, quorum voyager1,voyager2,voyager3
+      mgr: voyager1(active), standbys: voyager3
+      mds: cephfs-1/1/1 up {0=mds-ceph-mds-65bb45dffc-cslr6=up:active}, 1 up:standby
+      osd: 23 osds: 23 up, 23 in
+      rgw: 2 daemons active
+
+    data:
+      pools:   18 pools, 182 pgs
+      objects: 240 objects, 3359 bytes
+      usage:   2551 MB used, 42814 GB / 42816 GB avail
+      pgs:     182 active+clean
+
+5. 
Replace the failed disk with a new one. If you repair (not replace) the failed disk, +you may need to run the following: + +.. code-block:: console + + (voyager4)$ parted /dev/sdh mklabel msdos + +6. Start a new OSD pod on ``voyager4``: + +.. code-block:: console + + $ kubectl label nodes voyager4 --overwrite ceph_maintenance_window=inactive + +7. Validate the Ceph status (i.e., one OSD is added, so the total number of OSDs becomes 24): + +.. code-block:: console + + (mon-pod):/# ceph -s + cluster: + id: 9d4d8c61-cf87-4129-9cef-8fbf301210ad + health: HEALTH_WARN + too few PGs per OSD (22 < min 30) + mon voyager1 is low on available space + + services: + mon: 3 daemons, quorum voyager1,voyager2,voyager3 + mgr: voyager1(active), standbys: voyager3 + mds: cephfs-1/1/1 up {0=mds-ceph-mds-65bb45dffc-cslr6=up:active}, 1 up:standby + osd: 24 osds: 24 up, 24 in + rgw: 2 daemons active + + data: + pools: 18 pools, 182 pgs + objects: 240 objects, 3359 bytes + usage: 2665 MB used, 44675 GB / 44678 GB avail + pgs: 182 active+clean diff --git a/doc/source/testing/ceph-resiliency/host-failure.rst b/doc/source/testing/ceph-resiliency/host-failure.rst new file mode 100644 index 0000000000..1194a0b0d0 --- /dev/null +++ b/doc/source/testing/ceph-resiliency/host-failure.rst @@ -0,0 +1,98 @@ +============ +Host Failure +============ + +Test Environment +================ + +- Cluster size: 4 host machines +- Number of disks: 24 (= 6 disks per host * 4 hosts) +- Kubernetes version: 1.10.5 +- Ceph version: 12.2.3 +- OpenStack-Helm commit: 25e50a34c66d5db7604746f4d2e12acbdd6c1459 + +Case: One host machine where ceph-mon is running is rebooted +============================================================ + +Symptom: +-------- + +After reboot (node voyager3), the node status changes to ``NotReady``. + +.. code-block:: console + + $ kubectl get nodes + NAME STATUS ROLES AGE VERSION + voyager1 Ready master 6d v1.10.5 + voyager2 Ready 6d v1.10.5 + voyager3 NotReady 6d v1.10.5 + voyager4 Ready 6d v1.10.5 + +Ceph status shows that ceph-mon running on ``voyager3`` becomes out of quorum. +Also, six osds running on ``voyager3`` are down; i.e., 18 osds are up out of 24 osds. + +.. code-block:: console + + (mon-pod):/# ceph -s + cluster: + id: 9d4d8c61-cf87-4129-9cef-8fbf301210ad + health: HEALTH_WARN + 6 osds down + 1 host (6 osds) down + Degraded data redundancy: 195/624 objects degraded (31.250%), 8 pgs degraded + too few PGs per OSD (17 < min 30) + mon voyager1 is low on available space + 1/3 mons down, quorum voyager1,voyager2 + + services: + mon: 3 daemons, quorum voyager1,voyager2, out of quorum: voyager3 + mgr: voyager1(active), standbys: voyager3 + mds: cephfs-1/1/1 up {0=mds-ceph-mds-65bb45dffc-cslr6=up:active}, 1 up:standby + osd: 24 osds: 18 up, 24 in + rgw: 2 daemons active + + data: + pools: 18 pools, 182 pgs + objects: 208 objects, 3359 bytes + usage: 2630 MB used, 44675 GB / 44678 GB avail + pgs: 195/624 objects degraded (31.250%) + 126 active+undersized + 48 active+clean + 8 active+undersized+degraded + +Recovery: +--------- +The node status of ``voyager3`` changes to ``Ready`` after the node is up again. +Also, Ceph pods are restarted automatically. +Ceph status shows that the monitor running on ``voyager3`` is now in quorum. + +.. code-block:: console + + $ kubectl get nodes + NAME STATUS ROLES AGE VERSION + voyager1 Ready master 6d v1.10.5 + voyager2 Ready 6d v1.10.5 + voyager3 Ready 6d v1.10.5 + voyager4 Ready 6d v1.10.5 + +.. 
code-block:: console + + (mon-pod):/# ceph -s + cluster: + id: 9d4d8c61-cf87-4129-9cef-8fbf301210ad + health: HEALTH_WARN + too few PGs per OSD (22 < min 30) + mon voyager1 is low on available space + + services: + mon: 3 daemons, quorum voyager1,voyager2,voyager3 + mgr: voyager1(active), standbys: voyager3 + mds: cephfs-1/1/1 up {0=mds-ceph-mds-65bb45dffc-cslr6=up:active}, 1 up:standby + osd: 24 osds: 24 up, 24 in + rgw: 2 daemons active + + data: + pools: 18 pools, 182 pgs + objects: 208 objects, 3359 bytes + usage: 2635 MB used, 44675 GB / 44678 GB avail + pgs: 182 active+clean diff --git a/doc/source/testing/ceph-resiliency/index.rst b/doc/source/testing/ceph-resiliency/index.rst new file mode 100644 index 0000000000..f1de98cdf2 --- /dev/null +++ b/doc/source/testing/ceph-resiliency/index.rst @@ -0,0 +1,12 @@ +=============== +Ceph Resiliency +=============== + +.. toctree:: + :maxdepth: 2 + + README + monitor-failure + osd-failure + disk-failure + host-failure diff --git a/doc/source/testing/ceph-resiliency/monitor-failure.rst b/doc/source/testing/ceph-resiliency/monitor-failure.rst new file mode 100644 index 0000000000..3c88370369 --- /dev/null +++ b/doc/source/testing/ceph-resiliency/monitor-failure.rst @@ -0,0 +1,125 @@ +=============== +Monitor Failure +=============== + +Test Environment +================ + +- Cluster size: 4 host machines +- Number of disks: 24 (= 6 disks per host * 4 hosts) +- Kubernetes version: 1.9.3 +- Ceph version: 12.2.3 +- OpenStack-Helm commit: 28734352741bae228a4ea4f40bcacc33764221eb + +We have 3 Monitors in this Ceph cluster, one on each of the 3 Monitor +hosts. + +Case: 1 out of 3 Monitor Processes is Down +========================================== + +This is to test a scenario when 1 out of 3 Monitor processes is down. + +To bring down 1 Monitor process (out of 3), we identify a Monitor +process and kill it from the monitor host (not a pod). + +.. code-block:: console + + $ ps -ef | grep ceph-mon + ceph 16112 16095 1 14:58 ? 00:00:03 /usr/bin/ceph-mon --cluster ceph --setuser ceph --setgroup ceph -d -i voyager2 --mon-data /var/lib/ceph/mon/ceph-voyager2 --public-addr 135.207.240.42:6789 + $ sudo kill -9 16112 + +In the mean time, we monitored the status of Ceph and noted that it +takes about 24 seconds for the killed Monitor process to recover from +``down`` to ``up``. The reason is that Kubernetes automatically +restarts pods whenever they are killed. + +.. code-block:: console + + (mon-pod):/# ceph -s + cluster: + id: fd366aef-b356-4fe7-9ca5-1c313fe2e324 + health: HEALTH_WARN + mon voyager1 is low on available space + 1/3 mons down, quorum voyager1,voyager3 + + services: + mon: 3 daemons, quorum voyager1,voyager3, out of quorum: voyager2 + mgr: voyager4(active) + osd: 24 osds: 24 up, 24 in + +.. code-block:: console + + (mon-pod):/# ceph -s + cluster: + id: fd366aef-b356-4fe7-9ca5-1c313fe2e324 + health: HEALTH_WARN + mon voyager1 is low on available space + 1/3 mons down, quorum voyager1,voyager2 + + services: + mon: 3 daemons, quorum voyager1,voyager2,voyager3 + mgr: voyager4(active) + osd: 24 osds: 24 up, 24 in + +We also monitored the status of the Monitor pod through ``kubectl get +pods -n ceph``, and the status of the pod (where a Monitor process is +killed) changed as follows: ``Running`` -> ``Error`` -> ``Running`` +and this recovery process takes about 24 seconds. + +Case: 2 out of 3 Monitor Processes are Down +=========================================== + +This is to test a scenario when 2 out of 3 Monitor processes are down. 
+To bring down 2 Monitor processes (out of 3), we identify two Monitor +processes and kill them from the 2 monitor hosts (not a pod). + +We monitored the status of Ceph when the Monitor processes are killed +and noted that the symptoms are similar to when 1 Monitor process is +killed: + +- It takes longer (about 1 minute) for the killed Monitor processes to + recover from ``down`` to ``up``. + +- The status of the pods (where the two Monitor processes are killed) + changed as follows: ``Running`` -> ``Error`` -> ``CrashLoopBackOff`` + -> ``Running`` and this recovery process takes about 1 minute. + + +Case: 3 out of 3 Monitor Processes are Down +=========================================== + +This is to test a scenario when 3 out of 3 Monitor processes are down. +To bring down 3 Monitor processes (out of 3), we identify all 3 +Monitor processes and kill them from the 3 monitor hosts (not pods). + +We monitored the status of Ceph Monitor pods and noted that the +symptoms are similar to when 1 or 2 Monitor processes are killed: + +.. code-block:: console + + $ kubectl get pods -n ceph -o wide | grep ceph-mon + NAME READY STATUS RESTARTS AGE + ceph-mon-8tml7 0/1 Error 4 10d + ceph-mon-kstf8 0/1 Error 4 10d + ceph-mon-z4sl9 0/1 Error 7 10d + +.. code-block:: console + + $ kubectl get pods -n ceph -o wide | grep ceph-mon + NAME READY STATUS RESTARTS AGE + ceph-mon-8tml7 0/1 CrashLoopBackOff 4 10d + ceph-mon-kstf8 0/1 Error 4 10d + ceph-mon-z4sl9 0/1 CrashLoopBackOff 7 10d + + +.. code-block:: console + + $ kubectl get pods -n ceph -o wide | grep ceph-mon + NAME READY STATUS RESTARTS AGE + ceph-mon-8tml7 1/1 Running 5 10d + ceph-mon-kstf8 1/1 Running 5 10d + ceph-mon-z4sl9 1/1 Running 8 10d + +The status of the pods (where the three Monitor processes are killed) +changed as follows: ``Running`` -> ``Error`` -> ``CrashLoopBackOff`` +-> ``Running`` and this recovery process takes about 1 minute. diff --git a/doc/source/testing/ceph-resiliency/osd-failure.rst b/doc/source/testing/ceph-resiliency/osd-failure.rst new file mode 100644 index 0000000000..6578a91864 --- /dev/null +++ b/doc/source/testing/ceph-resiliency/osd-failure.rst @@ -0,0 +1,107 @@ +=========== +OSD Failure +=========== + +Test Environment +================ + +- Cluster size: 4 host machines +- Number of disks: 24 (= 6 disks per host * 4 hosts) +- Kubernetes version: 1.9.3 +- Ceph version: 12.2.3 +- OpenStack-Helm commit: 28734352741bae228a4ea4f40bcacc33764221eb + +Case: OSD processes are killed +============================== + +This is to test a scenario when some of the OSDs are down. + +To bring down 6 OSDs (out of 24), we identify the OSD processes and +kill them from a storage host (not a pod). + +.. code-block:: console + + $ ps -ef|grep /usr/bin/ceph-osd + ceph 44587 43680 1 18:12 ? 00:00:01 /usr/bin/ceph-osd --cluster ceph --osd-journal /dev/sdb5 -f -i 4 --setuser ceph --setgroup disk + ceph 44627 43744 1 18:12 ? 00:00:01 /usr/bin/ceph-osd --cluster ceph --osd-journal /dev/sdb2 -f -i 6 --setuser ceph --setgroup disk + ceph 44720 43927 2 18:12 ? 00:00:01 /usr/bin/ceph-osd --cluster ceph --osd-journal /dev/sdb6 -f -i 3 --setuser ceph --setgroup disk + ceph 44735 43868 1 18:12 ? 00:00:01 /usr/bin/ceph-osd --cluster ceph --osd-journal /dev/sdb1 -f -i 9 --setuser ceph --setgroup disk + ceph 44806 43855 1 18:12 ? 00:00:01 /usr/bin/ceph-osd --cluster ceph --osd-journal /dev/sdb4 -f -i 0 --setuser ceph --setgroup disk + ceph 44896 44011 2 18:12 ? 
00:00:01 /usr/bin/ceph-osd --cluster ceph --osd-journal /dev/sdb3 -f -i 1 --setuser ceph --setgroup disk
+  root  46144 45998  0 18:13 pts/10  00:00:00 grep --color=auto /usr/bin/ceph-osd
+
+  $ sudo kill -9 44587 44627 44720 44735 44806 44896
+
+.. code-block:: console
+
+  (mon-pod):/# ceph -s
+    cluster:
+      id:     fd366aef-b356-4fe7-9ca5-1c313fe2e324
+      health: HEALTH_WARN
+              6 osds down
+              1 host (6 osds) down
+              Reduced data availability: 8 pgs inactive, 58 pgs peering
+              Degraded data redundancy: 141/1002 objects degraded (14.072%), 133 pgs degraded
+              mon voyager1 is low on available space
+
+    services:
+      mon: 3 daemons, quorum voyager1,voyager2,voyager3
+      mgr: voyager4(active)
+      osd: 24 osds: 18 up, 24 in
+
+In the meantime, we monitored the status of Ceph and noted that it takes
+about 30 seconds for the 6 OSDs to recover from ``down`` to ``up``.
+The reason is that Kubernetes automatically restarts OSD pods whenever they
+are killed.
+
+.. code-block:: console
+
+  (mon-pod):/# ceph -s
+    cluster:
+      id:     fd366aef-b356-4fe7-9ca5-1c313fe2e324
+      health: HEALTH_WARN
+              mon voyager1 is low on available space
+
+    services:
+      mon: 3 daemons, quorum voyager1,voyager2,voyager3
+      mgr: voyager4(active)
+      osd: 24 osds: 24 up, 24 in
+
+Case: An OSD pod is deleted
+===========================
+
+This is to test a scenario when an OSD pod is deleted by ``kubectl delete $OSD_POD_NAME``.
+Meanwhile, we monitored the status of Ceph and noted that it takes about 90 seconds
+for the OSD running in the deleted pod to recover from ``down`` to ``up``.
+
+.. code-block:: console
+
+  root@voyager3:/# ceph -s
+    cluster:
+      id:     fd366aef-b356-4fe7-9ca5-1c313fe2e324
+      health: HEALTH_WARN
+              1 osds down
+              Degraded data redundancy: 43/945 objects degraded (4.550%), 35 pgs degraded, 109 pgs undersized
+              mon voyager1 is low on available space
+
+    services:
+      mon: 3 daemons, quorum voyager1,voyager2,voyager3
+      mgr: voyager4(active)
+      osd: 24 osds: 23 up, 24 in
+
+.. code-block:: console
+
+  (mon-pod):/# ceph -s
+    cluster:
+      id:     fd366aef-b356-4fe7-9ca5-1c313fe2e324
+      health: HEALTH_WARN
+              mon voyager1 is low on available space
+
+    services:
+      mon: 3 daemons, quorum voyager1,voyager2,voyager3
+      mgr: voyager4(active)
+      osd: 24 osds: 24 up, 24 in
+
+We also monitored the pod status through ``kubectl get pods -n ceph``
+during this process. The deleted OSD pod status changed as follows:
+``Terminating`` -> ``Init:1/3`` -> ``Init:2/3`` -> ``Init:3/3`` ->
+``Running``, and this process takes about 90 seconds. The reason is
+that Kubernetes automatically restarts OSD pods whenever they are
+deleted.
diff --git a/doc/source/testing.rst b/doc/source/testing/helm-tests.rst
similarity index 98%
rename from doc/source/testing.rst
rename to doc/source/testing/helm-tests.rst
index 5c6f1ce080..aeba7d6ef0 100644
--- a/doc/source/testing.rst
+++ b/doc/source/testing/helm-tests.rst
@@ -1,9 +1,6 @@
-=======
-Testing
-=======
-
+==========
 Helm Tests
-----------
+==========
 
 Every OpenStack-Helm chart should include any required Helm tests necessary to
 provide a sanity check for the OpenStack service. Information on using the Helm
@@ -27,7 +24,6 @@ chart.
 If Rally tests are not appropriate or adequate for a service chart, any
 additional tests should be documented appropriately and adhere to the same
 expectations.
-
 
 Running Tests
 -------------
diff --git a/doc/source/testing/index.rst b/doc/source/testing/index.rst
new file mode 100644
index 0000000000..db9f155726
--- /dev/null
+++ b/doc/source/testing/index.rst
@@ -0,0 +1,9 @@
+=======
+Testing
+=======
+
+.. toctree::
+   :maxdepth: 2
+
+   helm-tests
+   ceph-resiliency/index
diff --git a/doc/source/troubleshooting/ceph.rst b/doc/source/troubleshooting/ceph.rst
new file mode 100644
index 0000000000..db53b9d441
--- /dev/null
+++ b/doc/source/troubleshooting/ceph.rst
@@ -0,0 +1,59 @@
+Backing up a PVC
+^^^^^^^^^^^^^^^^
+
+Backing up a PVC stored in Ceph is fairly straightforward. This example uses
+the PVC ``mysql-data-mariadb-server-0``, but the same procedure applies to any
+other service using PVCs, e.g. RabbitMQ or Postgres.
+
+
+.. code-block:: shell
+
+  # get all required details
+  NS_NAME="openstack"
+  PVC_NAME="mysql-data-mariadb-server-0"
+  # you can check this by running kubectl get pvc -n ${NS_NAME}
+
+  PV_NAME="$(kubectl get -n ${NS_NAME} pvc "${PVC_NAME}" --no-headers | awk '{ print $3 }')"
+  RBD_NAME="$(kubectl get pv "${PV_NAME}" -o json | jq -r '.spec.rbd.image')"
+  MON_POD=$(kubectl get pods \
+    --namespace=ceph \
+    --selector="application=ceph" \
+    --selector="component=mon" \
+    --no-headers | awk '{ print $1; exit }')
+
+  # copy the admin keyring from a ceph-mon pod to the host node
+
+  kubectl exec -it ${MON_POD} -n ceph -- cat /etc/ceph/ceph.client.admin.keyring > /etc/ceph/ceph.client.admin.keyring
+  sudo kubectl get cm -n ceph ceph-etc -o json|jq -j .data[] > /etc/ceph/ceph.conf
+
+  export CEPH_MON_NAME="ceph-mon-discovery.ceph.svc.cluster.local"
+
+  # create a snapshot and export it to a file
+
+  rbd snap create rbd/${RBD_NAME}@snap1 -m ${CEPH_MON_NAME}
+  rbd snap list rbd/${RBD_NAME} -m ${CEPH_MON_NAME}
+
+  # Export the snapshot and compress it; make sure the host has enough space
+  # to accommodate the resulting files.
+
+  # a. if there is enough space on the host
+
+  rbd export rbd/${RBD_NAME}@snap1 /backup/${RBD_NAME}.img -m ${CEPH_MON_NAME}
+  cd /backup
+  time xz -0vk --threads=0 /backup/${RBD_NAME}.img
+
+  # b. if space on the host is limited, export and compress in a single command
+
+  rbd export rbd/${RBD_NAME}@snap1 -m ${CEPH_MON_NAME} - | xz -0v --threads=0 > /backup/${RBD_NAME}.img.xz
+
+
+Restoring is just as straightforward. Once the workload consuming the device
+has been stopped and the raw RBD device removed, the following will import the
+backup and create a device:
+
+.. code-block:: shell
+
+  cd /backup
+  unxz -k ${RBD_NAME}.img.xz
+  rbd import /backup/${RBD_NAME}.img rbd/${RBD_NAME} -m ${CEPH_MON_NAME}
+
+Once this has been done the workload can be restarted.
diff --git a/doc/source/troubleshooting/index.rst b/doc/source/troubleshooting/index.rst
index 7558079846..621f85e3f8 100644
--- a/doc/source/troubleshooting/index.rst
+++ b/doc/source/troubleshooting/index.rst
@@ -11,6 +11,7 @@ Sometimes things go wrong. These guides will help you solve many common issues w
    persistent-storage
    proxy
    ubuntu-hwe-kernel
+   ceph
 
 Getting help
 ============
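
A possible follow-up to the PVC backup procedure in
``doc/source/troubleshooting/ceph.rst`` above: once the exported image has
been verified, the snapshot created for the backup can be removed so that it
does not linger in the pool. This is a sketch that assumes the ``RBD_NAME``
and ``CEPH_MON_NAME`` variables set in the backup example:

.. code-block:: shell

  # list snapshots of the image and remove the one used for the backup
  rbd snap ls rbd/${RBD_NAME} -m ${CEPH_MON_NAME}
  rbd snap rm rbd/${RBD_NAME}@snap1 -m ${CEPH_MON_NAME}

  # after a restore, confirm the image exists and has the expected size
  rbd info rbd/${RBD_NAME} -m ${CEPH_MON_NAME}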