Add failure domains, and device classes for custom CRUSH rules

Largely inspired and taken from Kranthi's PS. - Add support for creating custom CRUSH rules based off of failure domains and device classes (ssd & hdd) - Basic logic around the PG calculator to autodetect the number of OSDs globally and per device class (required when using custom crush rules that specify device classes). Change-Id: I13a6f5eb21494746c2b77e340e8d0dcb0d81a591
2018-11-23 14:59:37 -06:00 · 2018-11-23 14:59:37 -06:00 · 6e8c289c13
commit 6e8c289c13
parent 8e369d2c9c
6 changed files with 151 additions and 52 deletions
--- a/ceph-client/templates/bin/pool/_init.sh.tpl
+++ b/ceph-client/templates/bin/pool/_init.sh.tpl
@ -20,8 +20,6 @@ set -ex
 export LC_ALL=C

 : "${ADMIN_KEYRING:=/etc/ceph/${CLUSTER}.client.admin.keyring}"
-: "${OSD_TARGET_PGS:=100}"
-: "${QUANTITY_OSDS:=15}"

 if [[ ! -e /etc/ceph/${CLUSTER}.conf ]]; then
  echo "ERROR- /etc/ceph/${CLUSTER}.conf must exist; get it from your existing mon"
@ -33,13 +31,21 @@ if [[ ! -e ${ADMIN_KEYRING} ]]; then
   exit 1
 fi

-if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^same_host$"; then
-  ceph --cluster "${CLUSTER}" osd crush rule create-simple same_host default osd
+function create_crushrule () {  # args: rule name, "osd crush rule" subcommand, failure domain, optional device class
+  CRUSH_NAME=$1
+  CRUSH_RULE=$2
+  CRUSH_FAILURE_DOMAIN=$3
+  CRUSH_DEVICE_CLASS=$4  # may be empty; kept unquoted below so an empty value adds no argument
+  if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -qxF -- "${CRUSH_NAME}"; then  # exact whole-line match on the rule name
+    ceph --cluster "${CLUSTER}" osd crush rule $CRUSH_RULE $CRUSH_NAME default $CRUSH_FAILURE_DOMAIN $CRUSH_DEVICE_CLASS || true  # best effort: a failed create must not abort the script
  fi
+}

-if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^rack_replicated_rule$"; then
-  ceph --cluster "${CLUSTER}" osd crush rule create-simple rack_replicated_rule default rack
-fi
+{{- range $crush_rule := .Values.conf.pool.crush_rules -}}
+{{- with $crush_rule }}
+create_crushrule {{ .name }} {{ .crush_rule }} {{ .failure_domain }} {{ .device_class }}
+{{- end }}
+{{- end }}

 function reweight_osds () {
  for OSD_ID in $(ceph --cluster "${CLUSTER}" osd df | awk '$3 == "0" {print $1}'); do
@ -105,28 +111,35 @@ function manage_pool () {
  POOL_APPLICATION=$1
  POOL_NAME=$2
  POOL_REPLICATION=$3
-  TOTAL_OSDS=$4
-  TOTAL_DATA_PERCENT=$5
-  TARGET_PG_PER_OSD=$6
-  POOL_CRUSH_RULE=$7
-  POOL_PROTECTION=$8
+  TOTAL_DATA_PERCENT=$4
+  TARGET_PG_PER_OSD=$5
+  POOL_CRUSH_RULE=$6
+  POOL_PROTECTION=$7
+  TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd ls | wc -l)
+  if (ceph --cluster "${CLUSTER}" osd crush rule dump "${POOL_CRUSH_RULE}" | awk '/item_name/' | grep -q ssd); then
+    TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd tree | grep "ssd" | wc -l)
+  elif (ceph --cluster "${CLUSTER}" osd crush rule dump "${POOL_CRUSH_RULE}" | awk '/item_name/' | grep -q hdd); then
+    TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd tree | grep "hdd" | wc -l)
+  fi
  POOL_PLACEMENT_GROUPS=$(/tmp/pool-calc.py ${POOL_REPLICATION} ${TOTAL_OSDS} ${TOTAL_DATA_PERCENT} ${TARGET_PG_PER_OSD})
  create_pool "${POOL_APPLICATION}" "${POOL_NAME}" "${POOL_REPLICATION}" "${POOL_PLACEMENT_GROUPS}" "${POOL_CRUSH_RULE}" "${POOL_PROTECTION}"
 }

 reweight_osds

-{{ $targetNumOSD := .Values.conf.pool.target.osd }}
 {{ $targetPGperOSD := .Values.conf.pool.target.pg_per_osd }}
 {{ $crushRuleDefault := .Values.conf.pool.default.crush_rule }}
 {{ $targetProtection := .Values.conf.pool.target.protected | default "false" | quote | lower }}
 {{- range $pool := .Values.conf.pool.spec -}}
 {{- with $pool }}
-manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ $targetNumOSD }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ $crushRuleDefault }} {{ $targetProtection }}
+{{- if .crush_rule }}
+manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ .crush_rule }} {{ $targetProtection }}
+{{ else }}
+manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ $crushRuleDefault }} {{ $targetProtection }}
+{{- end }}
 {{- end }}
 {{- end }}

 {{- if .Values.conf.pool.crush.tunables }}
 ceph --cluster "${CLUSTER}" osd crush tunables {{ .Values.conf.pool.crush.tunables }}
 {{- end }}
-
--- a/ceph-client/values.yaml
+++ b/ceph-client/values.yaml
@ -123,17 +123,48 @@ conf:
      tunables: null
    target:
      #NOTE(portdirect): arbitrarily we set the default number of expected OSD's to 5
-      # to match the number of nodes in the OSH gate.
+      # to match the number of nodes in the OSH gate (used only for helm tests).
      osd: 5
      pg_per_osd: 100
      protected: true
    default:
-      # NOTE(supamatt): Accepted values are:
-      # same_host for a single node
-      # replicated_rule for a multi node
-      # rack_replicated_rule for a multi node in multiple (>=3) racks
-      # Ceph cluster must be in a healthy state.
+      # NOTE(supamatt): Accepted values are taken from `crush_rules` list.
      crush_rule: replicated_rule
+    crush_rules:
+      # NOTE(supamatt): Device classes must remain undefined if all OSDs are the
+      # same device type of backing disks (i.e., all HDD or all SSD).
+      - name: same_host
+        crush_rule: create-simple
+        failure_domain: osd
+        device_class:
+      - name: replicated_rule
+        crush_rule: create-simple
+        failure_domain: host
+        device_class:
+      - name: rack_replicated_rule
+        crush_rule: create-simple
+        failure_domain: rack
+        device_class:
+      # - name: replicated_rule-ssd
+      #   crush_rule: create-replicated
+      #   failure_domain: host
+      #   device_class: ssd
+      # - name: replicated_rule-hdd
+      #   crush_rule: create-replicated
+      #   failure_domain: host
+      #   device_class: hdd
+      # - name: rack_replicated_rule-ssd
+      #   crush_rule: create-replicated
+      #   failure_domain: rack
+      #   device_class: ssd
+      # - name: rack_replicated_rule-hdd
+      #   crush_rule: create-replicated
+      #   failure_domain: rack
+      #   device_class: hdd
+      # - name: row_replicated_rule
+      #   crush_rule: create-simple
+      #   failure_domain: row
+      #   device_class:

    # NOTE(portdirect): this section describes the pools that will be managed by
    # the ceph pool management job, as it tunes the pgs and crush rule, based on
@ -144,6 +175,10 @@ conf:
        application: rbd
        replication: 3
        percent_total_data: 40
+      # NOTE(supamatt): By default the crush rules used to create each pool will be
+      # taken from the pool default `crush_rule` unless a pool specific `crush_rule`
+      # is specified. The rule MUST exist for it to be defined here.
+      #  crush_rule: replicated_rule
      # CephFS pools
      - name: cephfs_metadata
        application: cephfs
@ -214,6 +249,7 @@ conf:
        application: rgw
        replication: 3
        percent_total_data: 34.8
+
  ceph:
    global:
      # auth
--- a/ceph-osd/templates/bin/osd/_block.sh.tpl
+++ b/ceph-osd/templates/bin/osd/_block.sh.tpl
@ -126,27 +126,40 @@ OSD_PATH="${OSD_PATH_BASE}-${OSD_ID}"
 OSD_KEYRING="${OSD_PATH}/keyring"
 # NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
 OSD_WEIGHT=0
-if [ "x${CRUSH_RULE}" == "xrack_replicated_rule" ]; then
-  RACK_LOCATION=$(echo rack_$(echo ${HOSTNAME} | cut -c ${RACK_REGEX}))
-  CRUSH_LOCATION=$(echo "root=default rack=${RACK_LOCATION} host=${HOSTNAME}")
+function crush_create_or_move {  # create-or-move this OSD (OSD_ID, OSD_WEIGHT) to the CRUSH location given in $1; failures are ignored
+  local crush_location=${1}
   ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
-    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true
-  RACK_LOCATION_CHECK=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | awk -F'"' '/rack/{print $4}')
-  if [ "x${RACK_LOCATION_CHECK}" != x${RACK_LOCATION} ];  then
+    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${crush_location} || true  # ${crush_location} is unquoted, so a multi-word location splits into separate arguments
+}
+function crush_add_and_move {  # args: failure domain bucket type ($1) and bucket name ($2)
+  local crush_failure_domain_type=${1}
+  local crush_failure_domain_name=${2}
+  local crush_location=$(echo "root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}")
+  crush_create_or_move "${crush_location}"  # first attempt: let create-or-move place the OSD
+  local crush_failure_domain_location_check=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | grep "${crush_failure_domain_type}" | awk -F '"' '{print $4}')
+  if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ];  then  # value reported by "osd find" differs from the requested bucket name
     # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
     # as create-or-move may not appropiately move them.
     ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
-      osd crush add-bucket ${RACK_LOCATION} rack || true
+      osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true  # add the failure domain bucket; errors ignored
     ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
-      osd crush move ${RACK_LOCATION} root=default || true
+      osd crush move "${crush_failure_domain_name}" root=default || true  # move that bucket under root=default
     ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
-      osd crush move ${HOSTNAME} rack=${RACK_LOCATION} || true
+      osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true  # move this host's bucket under the failure domain bucket
+  fi
+}
+if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "xhost" ]; then  # both sides carry the "x" guard prefix, so type "host" takes the else branch
+  if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then  # an explicitly named failure domain bucket is checked first
+    crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}"
+  elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then  # otherwise name the bucket "<type>_<hostname characters selected by cut -c>"
+    crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))"
+  else
+    # NOTE(supamatt): if neither variable is defined, fall back to the expected default behavior
+    crush_create_or_move "${CRUSH_LOCATION}"
  fi
 else
-  ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
-    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true
+  crush_create_or_move "${CRUSH_LOCATION}"
 fi
-
 if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
  if [ -n "${OSD_JOURNAL}" ]; then
    if [ -b "${OSD_JOURNAL}" ]; then
--- a/ceph-osd/templates/bin/osd/_directory.sh.tpl
+++ b/ceph-osd/templates/bin/osd/_directory.sh.tpl
@ -73,7 +73,40 @@ if [[ -n "$(find /var/lib/ceph/osd -prune -empty)" ]]; then
  # add the osd to the crush map
  # NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
  OSD_WEIGHT=0
-  ceph --name=osd.${OSD_ID} --keyring=${OSD_KEYRING} osd crush create-or-move -- ${OSD_ID} ${OSD_WEIGHT} ${CRUSH_LOCATION}
+  function crush_create_or_move {  # create-or-move this OSD (OSD_ID, OSD_WEIGHT) to the CRUSH location given in $1; failures are ignored
+    local crush_location=${1}
+    ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+      osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${crush_location} || true  # ${crush_location} is unquoted, so a multi-word location splits into separate arguments
+  }
+  function crush_add_and_move {  # args: failure domain bucket type ($1) and bucket name ($2)
+    local crush_failure_domain_type=${1}
+    local crush_failure_domain_name=${2}
+    local crush_location=$(echo "root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}")
+    crush_create_or_move "${crush_location}"  # first attempt: let create-or-move place the OSD
+    local crush_failure_domain_location_check=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | grep "${crush_failure_domain_type}" | awk -F '"' '{print $4}')
+    if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ];  then  # value reported by "osd find" differs from the requested bucket name
+      # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
+      # as create-or-move may not appropriately move them.
+      ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+        osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true  # add the failure domain bucket; errors ignored
+      ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+        osd crush move "${crush_failure_domain_name}" root=default || true  # move that bucket under root=default
+      ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+        osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true  # move this host's bucket under the failure domain bucket
+    fi
+  }
+  if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "xhost" ]; then  # both sides carry the "x" guard prefix, so type "host" takes the else branch
+    if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then  # an explicitly named failure domain bucket is checked first
+      crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}"
+    elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then  # otherwise name the bucket "<type>_<hostname characters selected by cut -c>"
+      crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))"
+    else
+      # NOTE(supamatt): if neither variable is defined, fall back to the default behavior
+      crush_create_or_move "${CRUSH_LOCATION}"
+    fi
+  else
+    crush_create_or_move "${CRUSH_LOCATION}"
+  fi
 fi

 # create the directory and an empty Procfile
--- a/ceph-osd/templates/daemonset-osd.yaml
+++ b/ceph-osd/templates/daemonset-osd.yaml
@ -179,10 +179,12 @@ spec:
              value: "ceph"
            - name: CEPH_GET_ADMIN_KEY
              value: "1"
-            - name: CRUSH_RULE
-              value: {{ .Values.conf.pool.default.crush_rule }}
-            - name: RACK_REGEX
-              value: {{ .Values.conf.pool.default.rack_regex }}
+            - name: CRUSH_FAILURE_DOMAIN_TYPE
+              value: {{ .Values.conf.storage.failure_domain | default "host" | quote }}
+            - name: CRUSH_FAILURE_DOMAIN_NAME
+              value: {{ .Values.conf.storage.failure_domain_name | default "false" | quote }}
+            - name: CRUSH_FAILURE_DOMAIN_BY_HOSTNAME
+              value: {{ .Values.conf.storage.failure_domain_by_hostname | default "false" | quote }}
          command:
            - /tmp/osd-start.sh
          lifecycle:
--- a/ceph-osd/values.yaml
+++ b/ceph-osd/values.yaml
@ -106,20 +106,21 @@ conf:
      osd_recovery_max_active: 1
      osd_mount_options_xfs: "rw,noatime,largeio,inode64,swalloc,logbufs=8,logbsize=256k,allocsize=4M"
      osd_journal_size: 10240
-
-  pool:
-    default:
-      # NOTE(supamatt): Accepted values are:
-      # same_host for a single node
-      # replicated_rule for a multi node
-      # rack_replicated_rule for a multi node in multiple (>=3) racks
-      # Ceph cluster must be in a healthy state.
-      crush_rule: replicated_rule
-      # NOTE(supamatt): By default use the first 8 characters of the hostname to
-      # define the the rack type bucket names for CRUSH.
-      rack_regex: "1-8"
+      osd_crush_update_on_start: false

  storage:
+    # NOTE(supamatt): By default use host based buckets for failure domains. Any `failure_domain` defined must
+    # match the failure domain used on your CRUSH rules for pools. For example with a crush rule of
+    # rack_replicated_rule you would specify "rack" as the `failure_domain` to use.
+    # `failure_domain`: Set the CRUSH bucket type for your OSD to reside in. The supported CRUSH configuration
+    #  is listed here: http://docs.ceph.com/docs/luminous/rados/operations/crush-map/
+    # `failure_domain_by_hostname`: Specify the portion of the hostname to use for your failure domain bucket name.
+    # `failure_domain_name`: Manually name the failure domain bucket name. This configuration option should only be used
+    #  when using host based overrides.
+    # failure_domain: "rack"
+    # failure_domain_by_hostname: 1-8
+    # failure_domain_name: false
+
    # NOTE(portdirect): for homogeneous clusters the `osd` key can be used to
    # define OSD pods that will be deployed across the cluster.
    osd:
@ -149,6 +150,7 @@ conf:
 #       - name: host1.fqdn
 #         conf:
 #           storage:
+#             failure_domain_name: "rack1"
 #             osd:
 #               - data:
 #                   type: directory