diff --git a/ceph-mon/templates/bin/utils/_checkPGs.py.tpl b/ceph-mon/templates/bin/utils/_checkPGs.py.tpl new file mode 100755 index 000000000..1f05bcae6 --- /dev/null +++ b/ceph-mon/templates/bin/utils/_checkPGs.py.tpl @@ -0,0 +1,256 @@ +#!/usr/bin/python2 + +import subprocess +import json +import sys +from argparse import * + +class cephCRUSH(): + """ + Currently, this script is coded to work with the ceph clusters that have + these type-ids -- osd, host, rack, root. To add other type_ids to the + CRUSH map, this script needs enhancements to include the new type_ids. + + type_id name + ------- ---- + 0 osd + 1 host + 2 chassis + 3 rack + 4 row + 5 pdu + 6 pod + 7 room + 8 datacenter + 9 region + 10 root + + Ceph organizes the CRUSH map in hierarchical topology. At the top, it is + the root. The next levels are racks, hosts, and OSDs, respectively. The + OSDs are at the leaf level. This script looks at OSDs in each placement + group of a ceph pool. For each OSD, starting from the OSD leaf level, this + script traverses up to the root. Along the way, the host and rack are + recorded and then verified to make sure the paths to the root are in + separate failure domains. This script reports the offending PGs to stdout. + """ + + """ + This list stores the ceph crush hierarchy retrieved from the + ceph osd crush tree -f json-pretty + """ + crushHierarchy = [] + + """ + Failure Domains - currently our crush map uses these type IDs - osd, + host, rack, root + If we need to add chassis type (or other types) later on, add the + type to the if statement in the crushFD construction section. 
+ + crushFD[0] = {'id': -2, 'name': 'host1', 'type': 'host'} + crushFD[23] = {'id': -5, 'name': 'host2', 'type': 'host'} + crushFD[68] = {'id': -7, 'name': 'host3', 'type': 'host'} + rack_FD[-2] = {'id': -9, 'name': 'rack1', 'type': 'rack' } + rack_FD[-15] = {'id': -17, 'name': 'rack2', 'type': 'rack' } + root_FD[-17] = {'id': -1, 'name': 'default', 'type': 'root' }} + root_FD[-9] = {'id': -1, 'name': 'default', 'type': 'root' }} + """ + crushFD = {} + + def __init__(self, poolName): + if 'all' in poolName or 'All' in poolName: + try: + poolLs = 'ceph osd pool ls -f json-pretty' + poolstr = subprocess.check_output(poolLs, shell=True) + self.listPoolName = json.loads(poolstr) + except subprocess.CalledProcessError as e: + print('{}'.format(e)) + """Unable to get all pools - cannot proceed""" + sys.exit(2) + else: + self.listPoolName = poolName + + try: + """Retrieve the crush hierarchies""" + crushTree = "ceph osd crush tree -f json-pretty | grep -v '^\[\]'" + chstr = subprocess.check_output(crushTree, shell=True) + self.crushHierarchy = json.loads(chstr) + except subprocess.CalledProcessError as e: + print('{}'.format(e)) + """Unable to get crush hierarchy - cannot proceed""" + sys.exit(2) + + """ + Number of racks configured in the ceph cluster. The racks that are + present in the crush hierarchy may not be used. The un-used rack + would not show up in the crushFD. 
+ """ + self.count_racks = 0 + + """depth level - 3 is OSD, 2 is host, 1 is rack, 0 is root""" + self.osd_depth = 0 + """Construct the Failure Domains - OSD -> Host -> Rack -> Root""" + for chitem in self.crushHierarchy: + if chitem['type'] == 'host' or \ + chitem['type'] == 'rack' or \ + chitem['type'] == 'root': + for child in chitem['children']: + self.crushFD[child] = {'id': chitem['id'], 'name': chitem['name'], 'type': chitem['type']} + if chitem['type'] == 'rack' and len(chitem['children']) > 0: + self.count_racks += 1 + elif chitem['type'] == 'osd': + if self.osd_depth == 0: + self.osd_depth = chitem['depth'] + + """[ { 'pg-name' : [osd.1, osd.2, osd.3] } ... ]""" + self.poolPGs = [] + """Replica of the pool. Initialize to 0.""" + self.poolSize = 0 + + def getPoolSize(self, poolName): + """ + size (number of replica) is an attribute of a pool + { "pool": "rbd", "pool_id": 1, "size": 3 } + """ + pSize = {} + """Get the size attribute of the poolName""" + try: + poolGet = 'ceph osd pool get ' + poolName + ' size -f json-pretty' + szstr = subprocess.check_output(poolGet, shell=True) + pSize = json.loads(szstr) + self.poolSize = pSize['size'] + except subprocess.CalledProcessError as e: + print('{}'.format(e)) + self.poolSize = 0 + """Continue on""" + return + + def checkPGs(self, poolName): + if not len(self.poolPGs) > 0: + return + print('Checking PGs in pool {} ...'.format(poolName)), + badPGs = False + for pg in self.poolPGs: + osdUp = pg['up'] + """ + Construct the OSD path from the leaf to the root. If the + replica is set to 3 and there are 3 racks. Each OSD has its + own rack (failure domain). If more than one OSD has the + same rack, this is a violation. If the number of rack is + one, then we need to make sure the hosts for the three OSDs + are different. 
+ """ + check_FD = {} + checkFailed = False + for osd in osdUp: + traverseID = osd + """Start the level with 1 to include the OSD leaf""" + traverseLevel = 1 + while (self.crushFD[traverseID]['type'] != 'root'): + crushType = self.crushFD[traverseID]['type'] + crushName = self.crushFD[traverseID]['name'] + if crushType in check_FD: + check_FD[crushType].append(crushName) + else: + check_FD[crushType] = [crushName] + """traverse up (to the root) one level""" + traverseID = self.crushFD[traverseID]['id'] + traverseLevel += 1 + assert (traverseLevel == self.osd_depth), "OSD depth mismatch" + """ + check_FD should have + { + 'host': ['host1', 'host2', 'host3', 'host4'], + 'rack': ['rack1', 'rack2', 'rack3'] + } + Not checking for the 'root' as there is only one root. + """ + for ktype in check_FD: + kvalue = check_FD[ktype] + if ktype == 'host': + """ + At the host level, every OSD should come from different + host. It is a violation if duplicate hosts are found. + """ + if len(kvalue) != len(set(kvalue)): + if not badPGs: + print('Failed') + badPGs = True + print('OSDs {} in PG {} failed check in host {}'.format(pg['up'], pg['pgid'], kvalue)) + elif ktype == 'rack': + if len(kvalue) == len(set(kvalue)): + continue + else: + """ + There are duplicate racks. This could be due to + situation like pool's size is 3 and there are only + two racks (or one rack). OSDs should come from + different hosts as verified in the 'host' section. 
+ """ + if self.count_racks == len(set(kvalue)): + continue + elif self.count_racks > len(set(kvalue)): + """Not all the racks were used to allocate OSDs""" + if not badPGs: + print('Failed') + badPGs = True + print('OSDs {} in PG {} failed check in rack {}'.format(pg['up'], pg['pgid'], kvalue)) + check_FD.clear() + if not badPGs: + print('Passed') + return + + def checkPoolPGs(self): + for pool in self.listPoolName: + self.getPoolSize(pool) + if self.poolSize == 1: + """No need to check pool with the size set to 1 copy""" + print('Checking PGs in pool {} ... {}'.format(pool, 'Skipped')) + continue + elif self.poolSize == 0: + print('Pool {} was not found.'.format(pool)) + continue + assert (self.poolSize > 1), "Pool size was incorrectly set" + + try: + """Get the list of PGs in the pool""" + lsByPool = 'ceph pg ls-by-pool ' + pool + ' -f json-pretty' + pgstr = subprocess.check_output(lsByPool, shell=True) + self.poolPGs = json.loads(pgstr) + """Check that OSDs in the PG are in separate failure domains""" + self.checkPGs(pool) + except subprocess.CalledProcessError as e: + print('{}'.format(e)) + """Continue to the next pool (if any)""" + return + +def Main(): + parser = ArgumentParser(description=''' +Cross-check the OSDs assigned to the Placement Groups (PGs) of a ceph pool +with the CRUSH topology. The cross-check compares the OSDs in a PG and +verifies the OSDs reside in separate failure domains. PGs with OSDs in +the same failure domain are flagged as violation. The offending PGs are +printed to stdout. + +This CLI is executed on-demand on a ceph-mon pod. To invoke the CLI, you +can specify one pool or list of pools to check. The special pool name +All (or all) checks all the pools in the ceph cluster. 
+''', + formatter_class=RawTextHelpFormatter) + parser.add_argument('PoolName', type=str, nargs='+', + help='List of pools (or All) to validate the PGs and OSDs mapping') + args = parser.parse_args() + + if ('all' in args.PoolName or + 'All' in args.PoolName) and len(args.PoolName) > 1: + print('You only need to give one pool with special pool All') + sys.exit(1) + + """ + Retrieve the crush hierarchies and store it. Cross-check the OSDs + in each PG searching for failure domain violation. + """ + ccm = cephCRUSH(args.PoolName) + ccm.checkPoolPGs() + +if __name__ == '__main__': + Main() diff --git a/ceph-mon/templates/bin/utils/_checkPGs.sh.tpl b/ceph-mon/templates/bin/utils/_checkPGs.sh.tpl new file mode 100644 index 000000000..3ab82c35b --- /dev/null +++ b/ceph-mon/templates/bin/utils/_checkPGs.sh.tpl @@ -0,0 +1,23 @@ +#!/bin/bash + +{{/* +Copyright 2018 The Openstack-Helm Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/}} + +set -ex + +monPod=$(kubectl get pods --namespace=${DEPLOYMENT_NAMESPACE} --selector=application=ceph --selector=component=mon --output=jsonpath={.items[0].metadata.name} 2>/dev/null) + +kubectl exec -t ${monPod} --namespace=${DEPLOYMENT_NAMESPACE} -- /tmp/utils-checkPGs.py All 2>/dev/null diff --git a/ceph-mon/templates/configmap-bin.yaml b/ceph-mon/templates/configmap-bin.yaml index e9945bf58..8f9e10f63 100644 --- a/ceph-mon/templates/configmap-bin.yaml +++ b/ceph-mon/templates/configmap-bin.yaml @@ -54,6 +54,12 @@ data: moncheck-reap-zombies.py: | {{ tuple "bin/moncheck/_reap-zombies.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} + utils-checkPGs.py: | +{{ tuple "bin/utils/_checkPGs.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} + + utils-checkPGs.sh: | +{{ tuple "bin/utils/_checkPGs.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} + {{ if .Values.logging.fluentd }} fluentbit-sidecar.sh: | {{ tuple "bin/mon/_fluentbit-sidecar.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} diff --git a/ceph-mon/templates/cronjob-checkPGs.yaml b/ceph-mon/templates/cronjob-checkPGs.yaml new file mode 100644 index 000000000..6399fcb2f --- /dev/null +++ b/ceph-mon/templates/cronjob-checkPGs.yaml @@ -0,0 +1,52 @@ +{{/* +Copyright 2018 The Openstack-Helm Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +{{- if .Values.manifests.cronjob_checkPGs }} +{{- $envAll := . 
}} + +{{- $serviceAccountName := "ceph-pool-checkpgs" }} +{{ tuple $envAll "pool_checkpgs" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }} +--- +apiVersion: batch/v1beta1 +kind: CronJob +metadata: + name: {{ $serviceAccountName }} +spec: + schedule: {{ .Values.jobs.pool_checkPGs.cron | quote }} + successfulJobsHistoryLimit: {{ .Values.jobs.pool_checkPGs.history.successJob }} + failedJobsHistoryLimit: {{ .Values.jobs.pool_checkPGs.history.failJob }} + concurrencyPolicy: {{ .Values.jobs.pool_checkPGs.concurrency.execPolicy }} + startingDeadlineSeconds: {{ .Values.jobs.pool_checkPGs.startingDeadlineSecs }} + jobTemplate: + metadata: + labels: +{{ tuple $envAll "ceph" "pool-checkpgs" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }} + spec: + template: + spec: + containers: + - name: {{ $serviceAccountName }} +{{ tuple $envAll "ceph_config_helper" | include "helm-toolkit.snippets.image" | indent 12 }} + env: + - name: DEPLOYMENT_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + command: + - /tmp/utils-checkPGs.sh + restartPolicy: Never + +{{- end }} diff --git a/ceph-mon/templates/daemonset-mon.yaml b/ceph-mon/templates/daemonset-mon.yaml index 1b388172a..6bc81a5b2 100644 --- a/ceph-mon/templates/daemonset-mon.yaml +++ b/ceph-mon/templates/daemonset-mon.yaml @@ -156,6 +156,14 @@ spec: mountPath: /tmp/mon-check.sh subPath: mon-check.sh readOnly: true + - name: ceph-mon-bin + mountPath: /tmp/utils-checkPGs.py + subPath: utils-checkPGs.py + readOnly: true + - name: ceph-mon-bin + mountPath: /tmp/utils-checkPGs.sh + subPath: utils-checkPGs.sh + readOnly: true - name: ceph-mon-etc mountPath: /etc/ceph/ceph.conf subPath: ceph.conf diff --git a/ceph-mon/values.yaml b/ceph-mon/values.yaml index 757881864..5ad9b462d 100644 --- a/ceph-mon/values.yaml +++ b/ceph-mon/values.yaml @@ -113,6 +113,20 @@ network: public: 192.168.0.0/16 cluster: 192.168.0.0/16 +jobs: + pool_checkPGs: + # Execute 
monthly on the 1st at 00:01 AM + cron: "1 0 1 * *" + history: + # Number of successful job to keep + successJob: 1 + # Number of failed job to keep + failJob: 1 + concurrency: + # Skip new job if previous job still active + execPolicy: Forbid + startingDeadlineSecs: 60 + conf: templates: keyring: @@ -319,3 +333,4 @@ manifests: service_mon: true service_mon_discovery: true job_storage_admin_keys: true + cronjob_checkPGs: true diff --git a/doc/source/index.rst b/doc/source/index.rst index 63d378d81..936eb8913 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -7,6 +7,7 @@ Contents: :maxdepth: 2 install/index + testing/index Indices and Tables diff --git a/doc/source/testing/ceph-resiliency/README.rst b/doc/source/testing/ceph-resiliency/README.rst new file mode 100644 index 000000000..6d78dfbba --- /dev/null +++ b/doc/source/testing/ceph-resiliency/README.rst @@ -0,0 +1,21 @@ +============================================== +Resiliency Tests for OpenStack-Helm-Infra/Ceph +============================================== + +Mission +======= + +The goal of our resiliency tests for `OpenStack-Helm-Infra/Ceph +`_ is to +show symptoms of software/hardware failure and provide the solutions. + +Caveats: + - Our focus lies on resiliency for various failure scenarios but + not on performance or stress testing. + +Software Failure +================ +* `CRUSH Failure Domain <./failure-domain.html>`_ + +Hardware Failure +================ diff --git a/doc/source/testing/ceph-resiliency/failure-domain.rst b/doc/source/testing/ceph-resiliency/failure-domain.rst new file mode 100644 index 000000000..7c1873174 --- /dev/null +++ b/doc/source/testing/ceph-resiliency/failure-domain.rst @@ -0,0 +1,1234 @@ +.. -*- coding: utf-8 -*- + +.. 
NOTE TO MAINTAINERS: use rst2html script to convert .rst to .html + rst2html ./failure-domain.rst ./failure-domain.html + open ./failure-domain.html + +============================== + Failure Domains in CRUSH Map +============================== + +.. contents:: +.. sectnum:: + +Overview +======== + +The `CRUSH Map `__ in a Ceph cluster is best visualized +as an inverted tree. The hierarchical layout describes the physical +topology of the Ceph cluster. Through the physical topology, failure +domains are conceptualized from the different branches in the inverted +tree. CRUSH rules are created and map to failure domains with data +placement policy to distribute the data. + +The internal nodes (non-leaves and non-root) in the hierarchy are identified +as buckets. Each bucket is a hierarchical aggregation of storage locations +and their assigned weights. These are the types defined by CRUSH as the +supported buckets. + +:: + + # types + type 0 osd + type 1 host + type 2 chassis + type 3 rack + type 4 row + type 5 pdu + type 6 pod + type 7 room + type 8 datacenter + type 9 region + type 10 root + +This guide describes the host and rack buckets and their role in constructing +a CRUSH Map with separate failure domains. Once a Ceph cluster is configured +with the expected CRUSh Map and Rule, the PGs of the designated pool are +verified with a script (**utils-checkPGs.py**) to ensure that the OSDs in all the PGs +reside in separate failure domains. + +Ceph Environment +================ + +The ceph commands and scripts described in this write-up are executed as +Linux user root on one of the ceph monitors deployed as kubernetes +pods. The root user has the credential to execute all the ceph commands. + +On a kubernetes cluster, a separate namespace named **ceph** is configured +for the ceph cluster. Include the **ceph** namespace in **kubectl** when +executing this command. + +A kubernetes pod is a collection of docker containers sharing a network +and mount namespace. 
It is the basic unit of deployment in the kubernetes +cluster. The node in the kubernetes cluster where the orchestration +operations are performed needs access to the **kubectl** command. In this +guide, this node is referred to as the orchestration node. On this +node, you can list all the pods that are deployed. To execute a command +in a given pod, use **kubectl** to locate the name of the pod and switch +to it to execute the command. + +Orchestration Node +------------------ + +To gain access to the kubernetes orchestration node, use your login +credential and the authentication procedure assigned to you. For +environments setup with SSH key-based access, your id_rsa.pub (generated +through the ssh-keygen) public key should be in your ~/.ssh/authorized_keys +file on the orchestration node. + +The kubernetes and ceph commands require the root login credential to +execute. Your Linux login requires the *sudo* privilege to execute +commands as user root. On the orchestration node, acquire the root's +privilege with your Linux login through the *sudo* command. + +:: + + [orchestration]$ sudo -i + : + [orchestration]# + +Kubernetes Pods +--------------- + +On the orchestration node, execute the **kubectl** command to list the +specific set of pods with the **--selector** option. This **kubectl** +command lists all the ceph monitor pods. + +:: + + [orchestration]# kubectl get pods -n ceph --selector component=mon + NAME READY STATUS RESTARTS AGE + ceph-mon-85mlt 2/2 Running 0 9d + ceph-mon-9mpnb 2/2 Running 0 9d + ceph-mon-rzzqr 2/2 Running 0 9d + ceph-mon-snds8 2/2 Running 0 9d + ceph-mon-snzwx 2/2 Running 0 9d + +The following **kubectl** command lists the Ceph OSD pods. 
+ +:: + + [orchestration]# kubectl get pods -n ceph --selector component=osd + NAME READY STATUS RESTARTS AGE + ceph-osd-default-166a1044-95s74 2/2 Running 0 9d + ceph-osd-default-166a1044-bglnm 2/2 Running 0 9d + ceph-osd-default-166a1044-lq5qq 2/2 Running 0 9d + ceph-osd-default-166a1044-lz6x6 2/2 Running 0 9d + . . . + +To list all the pods in all the namespaces, execute this **kubectl** command. + +:: + + [orchestration]# kubectl get pods --all-namespaces + NAMESPACE NAME READY STATUS RESTARTS AGE + ceph ceph-bootstrap-rpzld 0/1 Completed 0 10d + ceph ceph-cephfs-client-key-generator-pvzs6 0/1 Completed 0 10d + ceph ceph-cephfs-provisioner-796668cd7-bn6mn 1/1 Running 0 10d + + +Execute Commands in Pods +^^^^^^^^^^^^^^^^^^^^^^^^ + +To execute multiple commands in a pod, you can switch to the execution +context of the pod with a /bin/bash session. + +:: + + [orchestration]# kubectl exec -it ceph-mon-85mlt -n ceph -- /bin/bash + [ceph-mon]# ceph status + cluster: + id: 07c31d0f-bcc6-4db4-aadf-2d2a0f13edb8 + health: HEALTH_OK + + services: + mon: 5 daemons, quorum host1,host2,host3,host4,host5 + mgr: host6(active), standbys: host1 + mds: cephfs-1/1/1 up {0=mds-ceph-mds-7cb4f57cc-prh87=up:active}, 1 up:standby + osd: 72 osds: 72 up, 72 in + rgw: 2 daemons active + + data: + pools: 20 pools, 3944 pgs + objects: 86970 objects, 323 GB + usage: 1350 GB used, 79077 GB / 80428 GB avail + pgs: 3944 active+clean + + io: + client: 981 kB/s wr, 0 op/s rd, 84 op/s wr + +To verify that you are executing within the context of a pod. Display the +content of the */proc/self/cgroup* control group file. The *kubepods* output +in the cgroup file shows that you're executing in a docker container of a pod. + +:: + + [ceph-mon]# cat /proc/self/cgroup + 11:hugetlb:/kubepods/besteffort/podafb3689c-8c5b-11e8-be6a-246e96290f14/ff6cbc58348a44722ee6a493845b9c2903fabdce80d0902d217cc4d6962d7b53 + . . . + +To exit the pod and resume the orchestration node's execution context. 
+ +:: + + [ceph-mon]# exit + [orchestration]# + +To verify that you are executing on the orchestration node's context, display +the */proc/self/cgroup* control group file. You would not see the *kubepods* +docker container in the output. + +:: + + [orchestration]# cat /proc/self/cgroup + 11:blkio:/user.slice + 10:freezer:/ + 9:hugetlb:/ + . . . + +It is also possible to run the ceph commands via the **kubectl exec** +without switching to a pod's container. + +:: + + [orchestration]# kubectl exec ceph-mon-9mpnb -n ceph -- ceph status + cluster: + id: 07c31d0f-bcc6-4db4-aadf-2d2a0f13edb8 + health: HEALTH_OK + . . . + + +Failure Domains +=============== + +A failure domain provides the fault isolation for the data and it corresponds +to a branch on the hierarchical topology. To protect against data loss, OSDs +that are allocated to PGs should be chosen from different failure +domains. Losing a branch takes down all the OSDs in that branch only and +OSDs in the other branches are not effected. + +In a data center, baremetal hosts are typically installed in a +rack (refrigerator size cabinet). Multiple racks with hosts in each rack +are used to provision the OSDs running on each host. A rack is envisioned +as a branch in the CRUSH topology. + +To provide data redundancy, ceph maintains multiple copies of the data. The +total number of copies to store for each piece of data is determined by the +ceph **osd_pool_default_size** ceph.conf parameter. With this parameter set +to 3, each piece of the data has 3 copies that gets stored in a pool. Each +copy is stored on different OSDs allocated from different failure domains. + +Host +---- + +Choosing host as the failure domain lacks all the protections against +data loss. + +To illustrate, a Ceph cluster has been provisioned with six hosts and four +OSDs on each host. The hosts are enclosed in respective racks where each +rack contains two hosts. 
+ +In the configuration of the Ceph cluster, without explicit instructions on +where the host and rack buckets should be placed, Ceph would create a +CRUSH map without the rack bucket. A CRUSH rule that get created uses +the host as the failure domain. With the size (replica) of a pool set +to 3, the OSDs in all the PGs are allocated from different hosts. + +:: + + root=default + ├── host1 + │   ├── osd.1 + │   ├── osd.2 + │   ├── osd.3 + │   └── osd.4 + ├── host2 + │   ├── osd.5 + │   ├── osd.6 + │   ├── osd.7 + │   └── osd.8 + ├── host3 + │   ├── osd.9 + │   ├── osd.10 + │   ├── osd.11 + │   └── osd.12 + ├── host4 + │   ├── osd.13 + │   ├── osd.14 + │   ├── osd.15 + │   └── osd.16 + ├── host5 + │   ├── osd.17 + │   ├── osd.18 + │   ├── osd.19 + │   └── osd.20 + └── host6 + ├── osd.21 + ├── osd.22 + ├── osd.23 + └── osd.24 + +On this ceph cluster, it has a CRUSH rule that uses the host as the +failure domain. + +:: + + # ceph osd crush rule ls + replicated_host + # ceph osd crush rule dump replicated_host + { + "rule_id": 0, + "rule_name": "replicated_host", + "ruleset": 0, + "type": 1, + "min_size": 1, + "max_size": 10, + "steps": [ + { + "op": "take", + "item": -1, + "item_name": "default" + }, + { + "op": "chooseleaf_firstn", + "num": 0, + "type": "host" }, + { + "op": "emit" + } + ] + } + +Verify the CRUSH rule that is assigned to the ceph pool. In this +example, the rbd pool is used. + +:: + + # ceph osd pool get rbd crush_rule + crush_rule: replicated_host + # ceph osd pool get rbd size + size: 3 + # ceph osd pool get rbd pg_num + pg_num: 1024 + + +To verify that the OSDs in all the PGs are allocated from different +hosts, invoke the **utils-checkPGs.py** utility on the ceph pool. The offending +PGs are printed to stdout. + +:: + + # /tmp/utils-checkPGs.py rbd + Checking PGs in pool rbd ... Passed + +With host as the failure domain, quite possibly, some of the PGs might +have OSDs allocated from different hosts that are located in the same +rack. 
For example, one PG might have OSD numbers [1, 8, 13]. OSDs 1 and 8 +are found on hosts located in rack1. When rack1 suffers a catastrophe +failure, PGs with OSDs allocated from the hosts in rack1 would be severely +degraded. + +Rack +---- + +Choosing rack as the failure domain provides better protection against data +loss. + +To prevent PGs with OSDs allocated from hosts that are located in the same +rack, configure the CRUSH hierarchy with the rack buckets. In each rack +bucket, it contains the hosts that reside in the same physical rack. A +CRUSH Rule is configured with rack as the failure domain. + +In the following hierarchical topology, the Ceph cluster was configured with +three rack buckets. Each bucket has two hosts. In pools that were created +with the CRUSH rule set to rack, the OSDs in all the PGs are allocated from +the distinct rack. + +:: + + root=default + ├── rack1 + │   ├── host1 + │   │   ├── osd.1 + │   │   ├── osd.2 + │   │   ├── osd.3 + │   │   └── osd.4 + │   └── host2 + │   ├── osd.5 + │   ├── osd.6 + │   ├── osd.7 + │   └── osd.8 + ├── rack2 + │   ├── host3 + │   │   ├── osd.9 + │   │   ├── osd.10 + │   │   ├── osd.11 + │   │   └── osd.12 + │   └── host4 + │   ├── osd.13 + │   ├── osd.14 + │   ├── osd.15 + │   └── osd.16 + └── rack3 + ├── host5 + │   ├── osd.17 + │   ├── osd.18 + │   ├── osd.19 + │   └── osd.20 + └── host6 + ├── osd.21 + ├── osd.22 + ├── osd.23 + └── osd.24 + +Verify the Ceph cluster has a CRUSH rule with rack as the failure domain. + +:: + + # ceph osd crush rule ls + replicated_rack + # ceph osd crush rule dump replicated_rack + { + "rule_id": 2, + "rule_name": "replicated_rack", + "ruleset": 2, + "type": 1, + "min_size": 1, + "max_size": 10, + "steps": [ + { + "op": "take", + "item": -1, + "item_name": "default" + }, + { + "op": "chooseleaf_firstn", + "num": 0, + "type": "rack" + }, + { + "op": "emit" + } + ] + } + +Create a ceph pool with its CRUSH rule set to the rack's rule. 
+ +:: + + # ceph osd pool create rbd 2048 2048 replicated replicated_rack + pool 'rbd' created + # ceph osd pool get rbd crush_rule + crush_rule: replicated_rack + # ceph osd pool get rbd size + size: 3 + # ceph osd pool get rbd pg_num + pg_num: 2048 + +Invoke the **utils-checkPGs.py** script on the pool to verify that there are no PGs +with OSDs allocated from the same rack. The offending PGs are printed to +stdout. + +:: + + # /tmp/utils-checkPGs.py rbd + Checking PGs in pool rbd ... Passed + + +CRUSH Map and Rule +================== + +On a properly configured Ceph cluster, there are different ways to view +the CRUSH hierarchy. + +ceph CLI +-------- + +Print to stdout the CRUSH hierarchy with the ceph CLI. + +:: + + root@host5:/# ceph osd crush tree + ID CLASS WEIGHT TYPE NAME + -1 78.47974 root default + -15 26.15991 rack rack1 + -2 13.07996 host host1 + 0 hdd 1.09000 osd.0 + 1 hdd 1.09000 osd.1 + 2 hdd 1.09000 osd.2 + 3 hdd 1.09000 osd.3 + 4 hdd 1.09000 osd.4 + 5 hdd 1.09000 osd.5 + 6 hdd 1.09000 osd.6 + 7 hdd 1.09000 osd.7 + 8 hdd 1.09000 osd.8 + 9 hdd 1.09000 osd.9 + 10 hdd 1.09000 osd.10 + 11 hdd 1.09000 osd.11 + -5 13.07996 host host2 + 12 hdd 1.09000 osd.12 + 13 hdd 1.09000 osd.13 + 14 hdd 1.09000 osd.14 + 15 hdd 1.09000 osd.15 + 16 hdd 1.09000 osd.16 + 17 hdd 1.09000 osd.17 + 18 hdd 1.09000 osd.18 + 19 hdd 1.09000 osd.19 + 20 hdd 1.09000 osd.20 + 21 hdd 1.09000 osd.21 + 22 hdd 1.09000 osd.22 + 23 hdd 1.09000 osd.23 + -16 26.15991 rack rack2 + -13 13.07996 host host3 + 53 hdd 1.09000 osd.53 + 54 hdd 1.09000 osd.54 + 58 hdd 1.09000 osd.58 + 59 hdd 1.09000 osd.59 + 64 hdd 1.09000 osd.64 + 65 hdd 1.09000 osd.65 + 66 hdd 1.09000 osd.66 + 67 hdd 1.09000 osd.67 + 68 hdd 1.09000 osd.68 + 69 hdd 1.09000 osd.69 + 70 hdd 1.09000 osd.70 + 71 hdd 1.09000 osd.71 + -9 13.07996 host host4 + 36 hdd 1.09000 osd.36 + 37 hdd 1.09000 osd.37 + 38 hdd 1.09000 osd.38 + 39 hdd 1.09000 osd.39 + 40 hdd 1.09000 osd.40 + 41 hdd 1.09000 osd.41 + 42 hdd 1.09000 osd.42 + 43 hdd 1.09000 
osd.43 + 44 hdd 1.09000 osd.44 + 45 hdd 1.09000 osd.45 + 46 hdd 1.09000 osd.46 + 47 hdd 1.09000 osd.47 + -17 26.15991 rack rack3 + -11 13.07996 host host5 + 48 hdd 1.09000 osd.48 + 49 hdd 1.09000 osd.49 + 50 hdd 1.09000 osd.50 + 51 hdd 1.09000 osd.51 + 52 hdd 1.09000 osd.52 + 55 hdd 1.09000 osd.55 + 56 hdd 1.09000 osd.56 + 57 hdd 1.09000 osd.57 + 60 hdd 1.09000 osd.60 + 61 hdd 1.09000 osd.61 + 62 hdd 1.09000 osd.62 + 63 hdd 1.09000 osd.63 + -7 13.07996 host host6 + 24 hdd 1.09000 osd.24 + 25 hdd 1.09000 osd.25 + 26 hdd 1.09000 osd.26 + 27 hdd 1.09000 osd.27 + 28 hdd 1.09000 osd.28 + 29 hdd 1.09000 osd.29 + 30 hdd 1.09000 osd.30 + 31 hdd 1.09000 osd.31 + 32 hdd 1.09000 osd.32 + 33 hdd 1.09000 osd.33 + 34 hdd 1.09000 osd.34 + 35 hdd 1.09000 osd.35 + root@host5:/# + +To see weight and affinity of each OSD. + +:: + + root@host5:/# ceph osd tree + ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF + -1 78.47974 root default + -15 26.15991 rack rack1 + -2 13.07996 host host1 + 0 hdd 1.09000 osd.0 up 1.00000 1.00000 + 1 hdd 1.09000 osd.1 up 1.00000 1.00000 + 2 hdd 1.09000 osd.2 up 1.00000 1.00000 + 3 hdd 1.09000 osd.3 up 1.00000 1.00000 + 4 hdd 1.09000 osd.4 up 1.00000 1.00000 + 5 hdd 1.09000 osd.5 up 1.00000 1.00000 + 6 hdd 1.09000 osd.6 up 1.00000 1.00000 + 7 hdd 1.09000 osd.7 up 1.00000 1.00000 + 8 hdd 1.09000 osd.8 up 1.00000 1.00000 + 9 hdd 1.09000 osd.9 up 1.00000 1.00000 + 10 hdd 1.09000 osd.10 up 1.00000 1.00000 + 11 hdd 1.09000 osd.11 up 1.00000 1.00000 + -5 13.07996 host host2 + 12 hdd 1.09000 osd.12 up 1.00000 1.00000 + 13 hdd 1.09000 osd.13 up 1.00000 1.00000 + 14 hdd 1.09000 osd.14 up 1.00000 1.00000 + 15 hdd 1.09000 osd.15 up 1.00000 1.00000 + 16 hdd 1.09000 osd.16 up 1.00000 1.00000 + 17 hdd 1.09000 osd.17 up 1.00000 1.00000 + 18 hdd 1.09000 osd.18 up 1.00000 1.00000 + 19 hdd 1.09000 osd.19 up 1.00000 1.00000 + 20 hdd 1.09000 osd.20 up 1.00000 1.00000 + 21 hdd 1.09000 osd.21 up 1.00000 1.00000 + 22 hdd 1.09000 osd.22 up 1.00000 1.00000 + 23 hdd 1.09000 
osd.23 up 1.00000 1.00000 + + +crushtool CLI +------------- + +To extract the CRUSH Map from a running cluster and convert it into ascii text. + +:: + + # ceph osd getcrushmap -o /tmp/cm.bin + 100 + # crushtool -d /tmp/cm.bin -o /tmp/cm.rack.ascii + # cat /tmp/cm.rack.ascii + . . . + # buckets + host host1 { + id -2 # do not change unnecessarily + id -3 class hdd # do not change unnecessarily + # weight 13.080 + alg straw2 + hash 0 # rjenkins1 + item osd.0 weight 1.090 + item osd.1 weight 1.090 + item osd.2 weight 1.090 + item osd.3 weight 1.090 + item osd.4 weight 1.090 + item osd.5 weight 1.090 + item osd.6 weight 1.090 + item osd.7 weight 1.090 + item osd.8 weight 1.090 + item osd.9 weight 1.090 + item osd.10 weight 1.090 + item osd.11 weight 1.090 + } + host host2 { + id -5 # do not change unnecessarily + id -6 class hdd # do not change unnecessarily + # weight 13.080 + alg straw2 + hash 0 # rjenkins1 + item osd.12 weight 1.090 + item osd.13 weight 1.090 + item osd.14 weight 1.090 + item osd.15 weight 1.090 + item osd.16 weight 1.090 + item osd.18 weight 1.090 + item osd.19 weight 1.090 + item osd.17 weight 1.090 + item osd.20 weight 1.090 + item osd.21 weight 1.090 + item osd.22 weight 1.090 + item osd.23 weight 1.090 + } + rack rack1 { + id -15 # do not change unnecessarily + id -20 class hdd # do not change unnecessarily + # weight 26.160 + alg straw2 + hash 0 # rjenkins1 + item host1 weight 13.080 + item host2 weight 13.080 + } + . . . 
+ root default { + id -1 # do not change unnecessarily + id -4 class hdd # do not change unnecessarily + # weight 78.480 + alg straw2 + hash 0 # rjenkins1 + item rack1 weight 26.160 + item rack2 weight 26.160 + item rack3 weight 26.160 + } + + # rules + rule replicated_rack { + id 2 + type replicated + min_size 1 + max_size 10 + step take default + step chooseleaf firstn 0 type rack + step emit + } + # end crush map + +The **utils-checkPGs.py** script can read the same data from memory and construct +the failure domains with OSDs. Verify the OSDs in each PG against the +constructed failure domains. + +You can edit the **/tmp/cm.rack.ascii** to modify the CRUSH Map. Compile +the modified ascii file into binary that has the new CRUSH Map. To set +the running ceph cluster with the new CRUSH Map, execute the following +commands on one of the monitor nodes: + +:: + + # vi /tmp/cm.rack.ascii + # crushtool -c /tmp/cm.rack.ascii -o /tmp/cm.bin.new + # ceph osd setcrushmap -i /tmp/cm.bin.new + # watch ceph status + +.. NOTE:: + + You have to know the CRUSH Map syntax really well in order for you to be able to manually edit the ascii file. + +Buckets +------- + +You have a pre-existing Ceph cluster that did not have the rack +buckets. You want to restructure the CRUSH hierarchy with the rack +buckets to a topology that is similar to the one presented earlier in +this guide. 
+ +:: + + root@host3:/# ceph osd crush tree + ID CLASS WEIGHT TYPE NAME + -1 78.47974 root default + -2 13.07996 host host1 + 0 hdd 1.09000 osd.0 + 1 hdd 1.09000 osd.1 + 2 hdd 1.09000 osd.2 + 3 hdd 1.09000 osd.3 + 4 hdd 1.09000 osd.4 + 5 hdd 1.09000 osd.5 + 6 hdd 1.09000 osd.6 + 7 hdd 1.09000 osd.7 + 8 hdd 1.09000 osd.8 + 9 hdd 1.09000 osd.9 + 10 hdd 1.09000 osd.10 + 11 hdd 1.09000 osd.11 + -5 13.07996 host host2 + 12 hdd 1.09000 osd.12 + 13 hdd 1.09000 osd.13 + 14 hdd 1.09000 osd.14 + 15 hdd 1.09000 osd.15 + 16 hdd 1.09000 osd.16 + 17 hdd 1.09000 osd.17 + 18 hdd 1.09000 osd.18 + 19 hdd 1.09000 osd.19 + 20 hdd 1.09000 osd.20 + 21 hdd 1.09000 osd.21 + 22 hdd 1.09000 osd.22 + 23 hdd 1.09000 osd.23 + -13 13.07996 host host3 + 60 hdd 1.09000 osd.60 + 61 hdd 1.09000 osd.61 + 62 hdd 1.09000 osd.62 + 63 hdd 1.09000 osd.63 + 64 hdd 1.09000 osd.64 + 65 hdd 1.09000 osd.65 + 66 hdd 1.09000 osd.66 + 67 hdd 1.09000 osd.67 + 68 hdd 1.09000 osd.68 + 69 hdd 1.09000 osd.69 + 70 hdd 1.09000 osd.70 + 71 hdd 1.09000 osd.71 + -9 13.07996 host host4 + 36 hdd 1.09000 osd.36 + 37 hdd 1.09000 osd.37 + 38 hdd 1.09000 osd.38 + 39 hdd 1.09000 osd.39 + 40 hdd 1.09000 osd.40 + 41 hdd 1.09000 osd.41 + 42 hdd 1.09000 osd.42 + 43 hdd 1.09000 osd.43 + 44 hdd 1.09000 osd.44 + 45 hdd 1.09000 osd.45 + 46 hdd 1.09000 osd.46 + 47 hdd 1.09000 osd.47 + -11 13.07996 host host5 + 48 hdd 1.09000 osd.48 + 49 hdd 1.09000 osd.49 + 50 hdd 1.09000 osd.50 + 51 hdd 1.09000 osd.51 + 52 hdd 1.09000 osd.52 + 53 hdd 1.09000 osd.53 + 54 hdd 1.09000 osd.54 + 55 hdd 1.09000 osd.55 + 56 hdd 1.09000 osd.56 + 57 hdd 1.09000 osd.57 + 58 hdd 1.09000 osd.58 + 59 hdd 1.09000 osd.59 + -7 13.07996 host host6 + 24 hdd 1.09000 osd.24 + 25 hdd 1.09000 osd.25 + 26 hdd 1.09000 osd.26 + 27 hdd 1.09000 osd.27 + 28 hdd 1.09000 osd.28 + 29 hdd 1.09000 osd.29 + 30 hdd 1.09000 osd.30 + 31 hdd 1.09000 osd.31 + 32 hdd 1.09000 osd.32 + 33 hdd 1.09000 osd.33 + 34 hdd 1.09000 osd.34 + 35 hdd 1.09000 osd.35 + root@host3:/# + +To include the rack 
bucket in the CRUSH Map, follow these steps. First, add +the required rack buckets with the user-defined names. + +:: + + root@host5:/# ceph osd crush add-bucket rack1 rack + added bucket rack1 type rack to crush map + root@host5:/# ceph osd crush add-bucket rack2 rack + added bucket rack2 type rack to crush map + root@host5:/# ceph osd crush add-bucket rack3 rack + added bucket rack3 type rack to crush map + root@host5:/# ceph osd tree + ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF + -17 0 rack rack3 + -16 0 rack rack2 + -15 0 rack rack1 + -1 78.47974 root default + . . . + +Move the hosts to the respective rack buckets. + +:: + + root@host5:/# ceph osd crush move host1 rack=rack1 + moved item id -2 name 'host1' to location {rack=rack1} in crush map + root@host5:/# ceph osd crush move host2 rack=rack1 + moved item id -5 name 'host2' to location {rack=rack1} in crush map + +Move the newly created rack rack1 to the root bucket. Verify the new +hierarchy with the ceph CLI. + +:: + + root@host5:/# ceph osd crush move rack1 root=default + moved item id -15 name 'rack1' to location {root=default} in crush map + root@host5:/# ceph osd tree + ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF + -17 0 rack rack3 + -16 0 rack rack2 + -1 78.47974 root default + -15 26.15991 rack rack1 + -2 13.07996 host host1 + 0 hdd 1.09000 osd.0 up 1.00000 1.00000 + 1 hdd 1.09000 osd.1 up 1.00000 1.00000 + 2 hdd 1.09000 osd.2 up 1.00000 1.00000 + 3 hdd 1.09000 osd.3 up 1.00000 1.00000 + 4 hdd 1.09000 osd.4 up 1.00000 1.00000 + 5 hdd 1.09000 osd.5 up 1.00000 1.00000 + 6 hdd 1.09000 osd.6 up 1.00000 1.00000 + 7 hdd 1.09000 osd.7 up 1.00000 1.00000 + 8 hdd 1.09000 osd.8 up 1.00000 1.00000 + 9 hdd 1.09000 osd.9 up 1.00000 1.00000 + 10 hdd 1.09000 osd.10 up 1.00000 1.00000 + 11 hdd 1.09000 osd.11 up 1.00000 1.00000 + -5 13.07996 host host2 + 12 hdd 1.09000 osd.12 up 1.00000 1.00000 + 13 hdd 1.09000 osd.13 up 1.00000 1.00000 + 14 hdd 1.09000 osd.14 up 1.00000 1.00000 + 15 hdd 1.09000 osd.15 
up 1.00000 1.00000 + 16 hdd 1.09000 osd.16 up 1.00000 1.00000 + 17 hdd 1.09000 osd.17 up 1.00000 1.00000 + 18 hdd 1.09000 osd.18 up 1.00000 1.00000 + 19 hdd 1.09000 osd.19 up 1.00000 1.00000 + 20 hdd 1.09000 osd.20 up 1.00000 1.00000 + 21 hdd 1.09000 osd.21 up 1.00000 1.00000 + 22 hdd 1.09000 osd.22 up 1.00000 1.00000 + 23 hdd 1.09000 osd.23 up 1.00000 1.00000 + . . . + +Repeat the same for rack2. + +:: + + root@host5:/# ceph osd crush move host3 rack=rack2 + moved item id -13 name 'host3' to location {rack=rack2} in crush map + root@host5:/# ceph osd crush move host4 rack=rack2 + moved item id -9 name 'host4' to location {rack=rack2} in crush map + root@host5:/# ceph osd crush move rack2 root=default + moved item id -16 name 'rack2' to location {root=default} in crush map + +Repeat the same for rack3. + +:: + + root@host5:/# ceph osd crush move host5 rack=rack3 + moved item id -11 name 'host5' to location {rack=rack3} in crush map + root@host5:/# ceph osd crush move host6 rack=rack3 + moved item id -7 name 'host6' to location {rack=rack3} in crush map + root@host5:/# ceph osd crush move rack3 root=default + moved item id -17 name 'rack3' to location {root=default} in crush map + +Extract the CRUSH Map from the in-memory copy and verify. + +:: + + root@host5:/# ceph osd getcrushmap -o /tmp/cm.bin.racks.6 + 100 + root@host5:/# crushtool -d /tmp/cm.bin.racks.6 -o /tmp/cm.ascii.racks.6 + root@host5:/# cat /tmp/cm.ascii.racks.6 + . . . 
+ # buckets + host host1 { + id -2 # do not change unnecessarily + id -3 class hdd # do not change unnecessarily + # weight 13.080 + alg straw2 + hash 0 # rjenkins1 + item osd.0 weight 1.090 + item osd.1 weight 1.090 + item osd.2 weight 1.090 + item osd.3 weight 1.090 + item osd.4 weight 1.090 + item osd.5 weight 1.090 + item osd.6 weight 1.090 + item osd.7 weight 1.090 + item osd.8 weight 1.090 + item osd.9 weight 1.090 + item osd.10 weight 1.090 + item osd.11 weight 1.090 + } + host host2 { + id -5 # do not change unnecessarily + id -6 class hdd # do not change unnecessarily + # weight 13.080 + alg straw2 + hash 0 # rjenkins1 + item osd.12 weight 1.090 + item osd.13 weight 1.090 + item osd.14 weight 1.090 + item osd.15 weight 1.090 + item osd.16 weight 1.090 + item osd.18 weight 1.090 + item osd.19 weight 1.090 + item osd.17 weight 1.090 + item osd.20 weight 1.090 + item osd.21 weight 1.090 + item osd.22 weight 1.090 + item osd.23 weight 1.090 + } + rack rack1 { + id -15 # do not change unnecessarily + id -20 class hdd # do not change unnecessarily + # weight 26.160 + alg straw2 + hash 0 # rjenkins1 + item host1 weight 13.080 + item host2 weight 13.080 + } + host host3 { + id -13 # do not change unnecessarily + id -14 class hdd # do not change unnecessarily + # weight 13.080 + alg straw2 + hash 0 # rjenkins1 + item osd.53 weight 1.090 + item osd.54 weight 1.090 + item osd.58 weight 1.090 + item osd.59 weight 1.090 + item osd.64 weight 1.090 + item osd.65 weight 1.090 + item osd.66 weight 1.090 + item osd.67 weight 1.090 + item osd.69 weight 1.090 + item osd.68 weight 1.090 + item osd.71 weight 1.090 + item osd.70 weight 1.090 + } + host host4 { + id -9 # do not change unnecessarily + id -10 class hdd # do not change unnecessarily + # weight 13.080 + alg straw2 + hash 0 # rjenkins1 + item osd.36 weight 1.090 + item osd.37 weight 1.090 + item osd.38 weight 1.090 + item osd.39 weight 1.090 + item osd.40 weight 1.090 + item osd.41 weight 1.090 + item osd.42 weight 
1.090 + item osd.44 weight 1.090 + item osd.45 weight 1.090 + item osd.46 weight 1.090 + item osd.47 weight 1.090 + item osd.43 weight 1.090 + } + rack rack2 { + id -16 # do not change unnecessarily + id -19 class hdd # do not change unnecessarily + # weight 26.160 + alg straw2 + hash 0 # rjenkins1 + item host3 weight 13.080 + item host4 weight 13.080 + } + host host5 { + id -11 # do not change unnecessarily + id -12 class hdd # do not change unnecessarily + # weight 13.080 + alg straw2 + hash 0 # rjenkins1 + item osd.49 weight 1.090 + item osd.48 weight 1.090 + item osd.50 weight 1.090 + item osd.51 weight 1.090 + item osd.52 weight 1.090 + item osd.55 weight 1.090 + item osd.56 weight 1.090 + item osd.57 weight 1.090 + item osd.60 weight 1.090 + item osd.61 weight 1.090 + item osd.62 weight 1.090 + item osd.63 weight 1.090 + } + host host6 { + id -7 # do not change unnecessarily + id -8 class hdd # do not change unnecessarily + # weight 13.080 + alg straw2 + hash 0 # rjenkins1 + item osd.24 weight 1.090 + item osd.25 weight 1.090 + item osd.26 weight 1.090 + item osd.27 weight 1.090 + item osd.28 weight 1.090 + item osd.29 weight 1.090 + item osd.30 weight 1.090 + item osd.31 weight 1.090 + item osd.32 weight 1.090 + item osd.33 weight 1.090 + item osd.34 weight 1.090 + item osd.35 weight 1.090 + } + rack rack3 { + id -17 # do not change unnecessarily + id -18 class hdd # do not change unnecessarily + # weight 26.160 + alg straw2 + hash 0 # rjenkins1 + item host5 weight 13.080 + item host6 weight 13.080 + } + root default { + id -1 # do not change unnecessarily + id -4 class hdd # do not change unnecessarily + # weight 78.480 + alg straw2 + hash 0 # rjenkins1 + item rack1 weight 26.160 + item rack2 weight 26.160 + item rack3 weight 26.160 + } + + # rules + rule replicated_rule { + id 0 + type replicated + min_size 1 + max_size 10 + step take default + step chooseleaf firstn 0 type host + step emit + } + rule same_host { + id 1 + type replicated + min_size 1 + 
max_size 10 + step take default + step choose firstn 0 type osd + step emit + } + rule replicated_rack { + id 2 + type replicated + min_size 1 + max_size 10 + step take default + step chooseleaf firstn 0 type rack + step emit + } + + # end crush map + root@host5:/# + +Create a CRUSH Rule with rack as the failure domain. + +:: + + root@host5:/# ceph osd crush rule create-replicated replicated_rack default rack + +Create a ceph pool that uses the new CRUSH Rule. + +:: + + root@host5:/# ceph osd pool create cmTestPool 2048 2048 replicated replicated_rack + pool 'cmTestPool' created + root@host5:/# /tmp/utils-checkPGs.py cmTestPool + Checking PGs in pool cmTestPool ... Passed + + +utils-checkPGs.py Script +======================== + +The purpose of the **utils-checkPGs.py** script is to check whether a PG has OSDs +allocated from the same failure domain. The violating PGs with their +respective OSDs are printed to the stdout. + +In this example, a pool was created with the CRUSH rule set to the host +failure domain. The ceph cluster was configured with the rack +buckets. The CRUSH algorithm allocated the OSDs from different hosts +in each PG. The rack buckets were ignored and thus the duplicate +racks which get reported by the script. + +:: + + root@host5:/# /tmp/utils-checkPGs.py cmTestPool + Checking PGs in pool cmTestPool ... Failed + OSDs [44, 32, 53] in PG 20.a failed check in rack [u'rack2', u'rack2', u'rack2'] + OSDs [61, 5, 12] in PG 20.19 failed check in rack [u'rack1', u'rack1', u'rack1'] + OSDs [69, 9, 15] in PG 20.2a failed check in rack [u'rack1', u'rack1', u'rack1'] + . . . + + +.. NOTE:: + + The **utils-checkPGs.py** utility is executed on-demand. It is intended to be executed on one of the ceph-mon pods. + +If the **utils-checkPGs.py** script did not find any violation, it prints +Passed. In this example, the ceph cluster was configured with the rack +buckets. The rbd pool was created with its CRUSH rule set to the +rack. 
The **utils-checkPGs.py** script did not find duplicate racks in PGs. + +:: + + root@host5:/# /tmp/utils-checkPGs.py rbd + Checking PGs in pool rbd ... Passed + +Invoke the **utils-checkPGs.py** script with the --help option to get the +script's usage. + +:: + + root@host5:/# /tmp/utils-checkPGs.py --help + usage: utils-checkPGs.py [-h] PoolName [PoolName ...] + + Cross-check the OSDs assigned to the Placement Groups (PGs) of a ceph pool + with the CRUSH topology. The cross-check compares the OSDs in a PG and + verifies the OSDs reside in separate failure domains. PGs with OSDs in + the same failure domain are flagged as violation. The offending PGs are + printed to stdout. + + This CLI is executed on-demand on a ceph-mon pod. To invoke the CLI, you + can specify one pool or list of pools to check. The special pool name + All (or all) checks all the pools in the ceph cluster. + + positional arguments: + PoolName List of pools (or All) to validate the PGs and OSDs mapping + + optional arguments: + -h, --help show this help message and exit + root@host5:/# + + +The source for the **utils-checkPGs.py** script is available +at **openstack-helm/ceph-mon/templates/bin/utils/_checkPGs.py.tpl**. + +Ceph Deployments +================ + +Through testing and verification, you arrive at a CRUSH Map with the buckets +that are deemed beneficial to your ceph cluster. Standardize on the verified +CRUSH map to have consistency in all the Ceph deployments across the +data centers. + +Mimicking the hierarchy in your CRUSH Map with the physical hardware setup +should provide the needed information on the topology layout. With the +racks layout, each rack can store a replica of your data. + +To validate a ceph cluster with the number of replicas that is based on +the number of racks: + +#. The number of physical racks and the number of replicas are both 3. 
Create a ceph pool with replica set to 3 and pg_num set to (# of OSDs * 50) / 3 and round the number to the next power-of-2. For example, if the calculation is 240, round it to 256. Assume the pool you just created has 256 PGs. In each PG, verify the OSDs are chosen from the three racks, respectively. Use the **utils-checkPGs.py** script to verify the OSDs in all the PGs of the pool. + +#. The number of physical racks is 2 and the number of replicas is 3. Create a ceph pool as described in the previous step. In the pool you created, in each PG, verify two of the OSDs are chosen from the two racks, respectively. The third OSD can come from one of the two racks but not from the same hosts as the other two OSDs. + +Data Movement +============= + +Changes to the CRUSH Map always trigger data movement. It is prudent that +you plan accordingly when restructuring the CRUSH Map. Once started, the +CRUSH Map restructuring runs to completion and can neither be stopped nor +suspended. On a busy Ceph cluster with live transactions, it is always +safer to use a divide-and-conquer approach to complete small chunks of work +in multiple sessions. + +Watch the progress of the data movement while the Ceph cluster re-balances +itself. 
+ +:: + + # watch ceph status + cluster: + id: 07c31d0f-bcc6-4db4-aadf-2d2a0f13edb8 + health: HEALTH_WARN + 137084/325509 objects misplaced (42.114%) + Degraded data redundancy: 28/325509 objects degraded (0.009%), 15 pgs degraded + + services: + mon: 5 daemons, quorum host1,host2,host3,host4,host5 + mgr: host6(active), standbys: host1 + mds: cephfs-1/1/1 up {0=mds-ceph-mds-7cb4f57cc-prh87=up:active}, 1 up:standby + osd: 72 osds: 72 up, 72 in; 815 remapped pgs + rgw: 2 daemons active + + data: + pools: 19 pools, 2920 pgs + objects: 105k objects, 408 GB + usage: 1609 GB used, 78819 GB / 80428 GB avail + pgs: 28/325509 objects degraded (0.009%) + 137084/325509 objects misplaced (42.114%) + 2085 active+clean + 790 active+remapped+backfill_wait + 22 active+remapped+backfilling + 15 active+recovery_wait+degraded + 4 active+recovery_wait+remapped + 4 active+recovery_wait + + io: + client: 11934 B/s rd, 3731 MB/s wr, 2 op/s rd, 228 kop/s wr + recovery: 636 MB/s, 163 objects/s + +At the time this **ceph status** command was executed, the status's output +showed that the ceph cluster was going through re-balancing. Among the +overall 2920 pgs, 2085 of them are in **active+clean** state. The +remaining pgs are either being remapped or recovered. As the ceph +cluster continues its re-balance, the number of pgs +in **active+clean** increases. 
+ +:: + + # ceph status + cluster: + id: 07c31d0f-bcc6-4db4-aadf-2d2a0f13edb8 + health: HEALTH_OK + + services: + mon: 5 daemons, quorum host1,host2,host3,host4,host5 + mgr: host6(active), standbys: host1 + mds: cephfs-1/1/1 up {0=mds-ceph-mds-7cc55c9695-lj22d=up:active}, 1 up:standby + osd: 72 osds: 72 up, 72 in + rgw: 2 daemons active + + data: + pools: 19 pools, 2920 pgs + objects: 134k objects, 519 GB + usage: 1933 GB used, 78494 GB / 80428 GB avail + pgs: 2920 active+clean + + io: + client: 1179 B/s rd, 971 kB/s wr, 1 op/s rd, 41 op/s wr + +When the overall number of pgs is equal to the number +of **active+clean** pgs, the health of the ceph cluster changes +to **HEALTH_OK** (assuming there are no other warning conditions). diff --git a/doc/source/testing/ceph-resiliency/index.rst b/doc/source/testing/ceph-resiliency/index.rst new file mode 100644 index 000000000..c93958e87 --- /dev/null +++ b/doc/source/testing/ceph-resiliency/index.rst @@ -0,0 +1,9 @@ +=============== +Ceph Resiliency +=============== + +.. toctree:: + :maxdepth: 2 + + README + failure-domain diff --git a/doc/source/testing/index.rst b/doc/source/testing/index.rst new file mode 100644 index 000000000..a48c2cc22 --- /dev/null +++ b/doc/source/testing/index.rst @@ -0,0 +1,8 @@ +======= +Testing +======= + +.. toctree:: + :maxdepth: 2 + + ceph-resiliency/index