diff --git a/ceph-client/templates/bin/pool/_init.sh.tpl b/ceph-client/templates/bin/pool/_init.sh.tpl
index 98d9f9fcb..8212c0034 100644
--- a/ceph-client/templates/bin/pool/_init.sh.tpl
+++ b/ceph-client/templates/bin/pool/_init.sh.tpl
@@ -20,8 +20,6 @@ set -ex
 export LC_ALL=C
 
 : "${ADMIN_KEYRING:=/etc/ceph/${CLUSTER}.client.admin.keyring}"
-: "${OSD_TARGET_PGS:=100}"
-: "${QUANTITY_OSDS:=15}"
 
 if [[ ! -e /etc/ceph/${CLUSTER}.conf ]]; then
   echo "ERROR- /etc/ceph/${CLUSTER}.conf must exist; get it from your existing mon"
@@ -33,13 +31,21 @@ if [[ ! -e ${ADMIN_KEYRING} ]]; then
    exit 1
 fi
 
-if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^same_host$"; then
-  ceph --cluster "${CLUSTER}" osd crush rule create-simple same_host default osd
-fi
+function create_crushrule () {
+  CRUSH_NAME=$1
+  CRUSH_RULE=$2
+  CRUSH_FAILURE_DOMAIN=$3
+  CRUSH_DEVICE_CLASS=$4
+  if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^${CRUSH_NAME}$"; then
+    ceph --cluster "${CLUSTER}" osd crush rule $CRUSH_RULE $CRUSH_NAME default $CRUSH_FAILURE_DOMAIN $CRUSH_DEVICE_CLASS || true
+  fi
+}
 
-if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^rack_replicated_rule$"; then
-  ceph --cluster "${CLUSTER}" osd crush rule create-simple rack_replicated_rule default rack
-fi
+{{- range $crush_rule := .Values.conf.pool.crush_rules -}}
+{{- with $crush_rule }}
+create_crushrule {{ .name }} {{ .crush_rule }} {{ .failure_domain }} {{ .device_class }}
+{{- end }}
+{{- end }}
 
 function reweight_osds () {
   for OSD_ID in $(ceph --cluster "${CLUSTER}" osd df | awk '$3 == "0" {print $1}'); do
@@ -105,28 +111,35 @@ function manage_pool () {
   POOL_APPLICATION=$1
   POOL_NAME=$2
   POOL_REPLICATION=$3
-  TOTAL_OSDS=$4
-  TOTAL_DATA_PERCENT=$5
-  TARGET_PG_PER_OSD=$6
-  POOL_CRUSH_RULE=$7
-  POOL_PROTECTION=$8
+  TOTAL_DATA_PERCENT=$4
+  TARGET_PG_PER_OSD=$5
+  POOL_CRUSH_RULE=$6
+  POOL_PROTECTION=$7
+  TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd ls | wc -l)
+  if (ceph --cluster "${CLUSTER}" osd crush rule dump "${POOL_CRUSH_RULE}" | awk '/item_name/' | grep -q ssd); then
+    TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd tree | grep "ssd" | wc -l)
+  elif (ceph --cluster "${CLUSTER}" osd crush rule dump "${POOL_CRUSH_RULE}" | awk '/item_name/' | grep -q hdd); then
+    TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd tree | grep "hdd" | wc -l)
+  fi
   POOL_PLACEMENT_GROUPS=$(/tmp/pool-calc.py ${POOL_REPLICATION} ${TOTAL_OSDS} ${TOTAL_DATA_PERCENT} ${TARGET_PG_PER_OSD})
   create_pool "${POOL_APPLICATION}" "${POOL_NAME}" "${POOL_REPLICATION}" "${POOL_PLACEMENT_GROUPS}" "${POOL_CRUSH_RULE}" "${POOL_PROTECTION}"
 }
 
 reweight_osds
 
-{{ $targetNumOSD := .Values.conf.pool.target.osd }}
 {{ $targetPGperOSD := .Values.conf.pool.target.pg_per_osd }}
 {{ $crushRuleDefault := .Values.conf.pool.default.crush_rule }}
 {{ $targetProtection := .Values.conf.pool.target.protected | default "false" | quote | lower }}
 {{- range $pool := .Values.conf.pool.spec -}}
 {{- with $pool }}
-manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ $targetNumOSD }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ $crushRuleDefault }} {{ $targetProtection }}
+{{- if .crush_rule }}
+manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ .crush_rule }} {{ $targetProtection }}
+{{ else }}
+manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ $crushRuleDefault }} {{ $targetProtection }}
+{{- end }}
 {{- end }}
 {{- end }}
 
 {{- if .Values.conf.pool.crush.tunables }}
ceph --cluster "${CLUSTER}" osd crush tunables {{ .Values.conf.pool.crush.tunables }} {{- end }} - diff --git a/ceph-client/values.yaml b/ceph-client/values.yaml index 3cb2c976f..d80634e76 100644 --- a/ceph-client/values.yaml +++ b/ceph-client/values.yaml @@ -123,19 +123,50 @@ conf: tunables: null target: #NOTE(portdirect): arbitrarily we set the default number of expected OSD's to 5 - # to match the number of nodes in the OSH gate. + # to match the number of nodes in the OSH gate (used only for helm tests). osd: 5 pg_per_osd: 100 protected: true default: - # NOTE(supamatt): Accepted values are: - # same_host for a single node - # replicated_rule for a multi node - # rack_replicated_rule for a multi node in multiple (>=3) racks - # Ceph cluster must be in a healthy state. + # NOTE(supamatt): Accepted values are taken from `crush_rules` list. crush_rule: replicated_rule + crush_rules: + # NOTE(supamatt): Device classes must remain undefined if all OSDs are the + # same device type of backing disks (ie, all HDD or all SDD). + - name: same_host + crush_rule: create-simple + failure_domain: osd + device_class: + - name: replicated_rule + crush_rule: create-simple + failure_domain: host + device_class: + - name: rack_replicated_rule + crush_rule: create-simple + failure_domain: rack + device_class: + # - name: replicated_rule-ssd + # crush_rule: create-replicated + # failure_domain: host + # device_class: sdd + # - name: replicated_rule-hdd + # crush_rule: create-replicated + # failure_domain: host + # device_class: hdd + # - name: rack_replicated_rule-ssd + # crush_rule: create-replicated + # failure_domain: rack + # device_class: ssd + # - name: rack_replicated_rule-hdd + # crush_rule: create-replicated + # failure_domain: rack + # device_class: hdd + # - name: row_replicated_rule + # crush_rule: create-simple + # failure_domain: row + # device_class: - #NOTE(portdirect): this section describes the pools that will be managed by + # NOTE(portdirect): this section describes the pools that will be managed by # the ceph pool management job, as it tunes the pgs and crush rule, based on # the above. spec: @@ -144,6 +175,10 @@ conf: application: rbd replication: 3 percent_total_data: 40 + # NOTE(supamatt): By default the crush rules used to create each pool will be + # taken from the pool default `crush_rule` unless a pool specific `crush_rule` + # is specified. The rule MUST exist for it to be defined here. 
diff --git a/ceph-client/values.yaml b/ceph-client/values.yaml
index 3cb2c976f..d80634e76 100644
--- a/ceph-client/values.yaml
+++ b/ceph-client/values.yaml
@@ -123,19 +123,50 @@ conf:
       tunables: null
     target:
       #NOTE(portdirect): arbitrarily we set the default number of expected OSD's to 5
-      # to match the number of nodes in the OSH gate.
+      # to match the number of nodes in the OSH gate (used only for helm tests).
      osd: 5
      pg_per_osd: 100
      protected: true
    default:
-      # NOTE(supamatt): Accepted values are:
-      # same_host for a single node
-      # replicated_rule for a multi node
-      # rack_replicated_rule for a multi node in multiple (>=3) racks
-      # Ceph cluster must be in a healthy state.
+      # NOTE(supamatt): Accepted values are taken from the `crush_rules` list below.
      crush_rule: replicated_rule
+    crush_rules:
+      # NOTE(supamatt): Device classes must remain undefined if all OSDs use the
+      # same type of backing disk (i.e. all HDD or all SSD).
+      - name: same_host
+        crush_rule: create-simple
+        failure_domain: osd
+        device_class:
+      - name: replicated_rule
+        crush_rule: create-simple
+        failure_domain: host
+        device_class:
+      - name: rack_replicated_rule
+        crush_rule: create-simple
+        failure_domain: rack
+        device_class:
+      # - name: replicated_rule-ssd
+      #   crush_rule: create-replicated
+      #   failure_domain: host
+      #   device_class: ssd
+      # - name: replicated_rule-hdd
+      #   crush_rule: create-replicated
+      #   failure_domain: host
+      #   device_class: hdd
+      # - name: rack_replicated_rule-ssd
+      #   crush_rule: create-replicated
+      #   failure_domain: rack
+      #   device_class: ssd
+      # - name: rack_replicated_rule-hdd
+      #   crush_rule: create-replicated
+      #   failure_domain: rack
+      #   device_class: hdd
+      # - name: row_replicated_rule
+      #   crush_rule: create-simple
+      #   failure_domain: row
+      #   device_class:
-    #NOTE(portdirect): this section describes the pools that will be managed by
+    # NOTE(portdirect): this section describes the pools that will be managed by
     # the ceph pool management job, as it tunes the pgs and crush rule, based on
     # the above.
     spec:
@@ -144,6 +175,10 @@ conf:
         application: rbd
         replication: 3
         percent_total_data: 40
+        # NOTE(supamatt): By default the crush rule used to create each pool is
+        # taken from the pool default `crush_rule` unless a pool-specific `crush_rule`
+        # is specified. The rule MUST already exist before it can be referenced here.
+        # crush_rule: replicated_rule
       # CephFS pools
       - name: cephfs_metadata
         application: cephfs
@@ -214,6 +249,7 @@ conf:
         application: rgw
         replication: 3
         percent_total_data: 34.8
+
   ceph:
     global:
       # auth
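As a sketch of the new pool handling, an rbd pool spec that sets a pool-specific crush_rule of rack_replicated_rule (an assumption; any rule from the crush_rules list above could be named) would render roughly as:

    # Hypothetical rendered call: application, name, replication, percent_total_data,
    # pg_per_osd, crush rule, protection flag
    manage_pool rbd rbd 3 40 100 rack_replicated_rule true
    # manage_pool now derives TOTAL_OSDS itself: it counts every OSD, or only the ssd/hdd
    # OSDs when the named rule is bound to a device class, before sizing PGs with /tmp/pool-calc.py
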
ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush add-bucket ${RACK_LOCATION} rack || true + osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush move ${RACK_LOCATION} root=default || true + osd crush move "${crush_failure_domain_name}" root=default || true ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush move ${HOSTNAME} rack=${RACK_LOCATION} || true + osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true + fi +} +if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "host" ]; then + if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then + crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}" + elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then + crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))" + else + # NOTE(supamatt): neither variables are defined then we fall back to expected default behavior + crush_create_or_move "${CRUSH_LOCATION}" fi else - ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ - osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true + crush_create_or_move "${CRUSH_LOCATION}" fi - if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then if [ -n "${OSD_JOURNAL}" ]; then if [ -b "${OSD_JOURNAL}" ]; then diff --git a/ceph-osd/templates/bin/osd/_directory.sh.tpl b/ceph-osd/templates/bin/osd/_directory.sh.tpl index 4645ce0e2..94d973302 100644 --- a/ceph-osd/templates/bin/osd/_directory.sh.tpl +++ b/ceph-osd/templates/bin/osd/_directory.sh.tpl @@ -73,7 +73,40 @@ if [[ -n "$(find /var/lib/ceph/osd -prune -empty)" ]]; then # add the osd to the crush map # NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing OSD_WEIGHT=0 - ceph --name=osd.${OSD_ID} --keyring=${OSD_KEYRING} osd crush create-or-move -- ${OSD_ID} ${OSD_WEIGHT} ${CRUSH_LOCATION} + function crush_create_or_move { + local crush_location=${1} + ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ + osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${crush_location} || true + } + function crush_add_and_move { + local crush_failure_domain_type=${1} + local crush_failure_domain_name=${2} + local crush_location=$(echo "root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}") + crush_create_or_move "${crush_location}" + local crush_failure_domain_location_check=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | grep "${crush_failure_domain_type}" | awk -F '"' '{print $4}') + if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ]; then + # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations + # as create-or-move may not appropiately move them. 
+ ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ + osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true + ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ + osd crush move "${crush_failure_domain_name}" root=default || true + ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \ + osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true + fi + } + if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "host" ]; then + if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then + crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}" + elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then + crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))" + else + # NOTE(supamatt): neither variables are defined then we fall back to default behavior + crush_create_or_move "${CRUSH_LOCATION}" + fi + else + crush_create_or_move "${CRUSH_LOCATION}" + fi fi # create the directory and an empty Procfile diff --git a/ceph-osd/templates/daemonset-osd.yaml b/ceph-osd/templates/daemonset-osd.yaml index 7c26825d6..2169d45fe 100644 --- a/ceph-osd/templates/daemonset-osd.yaml +++ b/ceph-osd/templates/daemonset-osd.yaml @@ -179,10 +179,12 @@ spec: value: "ceph" - name: CEPH_GET_ADMIN_KEY value: "1" - - name: CRUSH_RULE - value: {{ .Values.conf.pool.default.crush_rule }} - - name: RACK_REGEX - value: {{ .Values.conf.pool.default.rack_regex }} + - name: CRUSH_FAILURE_DOMAIN_TYPE + value: {{ .Values.conf.storage.failure_domain | default "host" | quote }} + - name: CRUSH_FAILURE_DOMAIN_NAME + value: {{ .Values.conf.storage.failure_domain_name | default "false" | quote }} + - name: CRUSH_FAILURE_DOMAIN_BY_HOSTNAME + value: {{ .Values.conf.storage.failure_domain_by_hostname | default "false" | quote }} command: - /tmp/osd-start.sh lifecycle: diff --git a/ceph-osd/values.yaml b/ceph-osd/values.yaml index 7e6b12262..009659313 100644 --- a/ceph-osd/values.yaml +++ b/ceph-osd/values.yaml @@ -106,20 +106,21 @@ conf: osd_recovery_max_active: 1 osd_mount_options_xfs: "rw,noatime,largeio,inode64,swalloc,logbufs=8,logbsize=256k,allocsize=4M" osd_journal_size: 10240 - - pool: - default: - # NOTE(supamatt): Accepted values are: - # same_host for a single node - # replicated_rule for a multi node - # rack_replicated_rule for a multi node in multiple (>=3) racks - # Ceph cluster must be in a healthy state. - crush_rule: replicated_rule - # NOTE(supamatt): By default use the first 8 characters of the hostname to - # define the the rack type bucket names for CRUSH. - rack_regex: "1-8" + osd_crush_update_on_start: false storage: + # NOTE(supamatt): By default use host based buckets for failure domains. Any `failure_domain` defined must + # match the failure domain used on your CRUSH rules for pools. For example with a crush rule of + # rack_replicated_rule you would specify "rack" as the `failure_domain` to use. + # `failure_domain`: Set the CRUSH bucket type for your OSD to reside in. See the supported CRUSH configuration + # as listed here: Supported CRUSH configuration is listed here: http://docs.ceph.com/docs/luminous/rados/operations/crush-map/ + # `failure_domain_by_hostname`: Specify the portion of the hostname to use for your failure domain bucket name. + # `failure_domain_name`: Manually name the failure domain bucket name. 
diff --git a/ceph-osd/values.yaml b/ceph-osd/values.yaml
index 7e6b12262..009659313 100644
--- a/ceph-osd/values.yaml
+++ b/ceph-osd/values.yaml
@@ -106,20 +106,21 @@ conf:
       osd_recovery_max_active: 1
       osd_mount_options_xfs: "rw,noatime,largeio,inode64,swalloc,logbufs=8,logbsize=256k,allocsize=4M"
       osd_journal_size: 10240
-
-  pool:
-    default:
-      # NOTE(supamatt): Accepted values are:
-      # same_host for a single node
-      # replicated_rule for a multi node
-      # rack_replicated_rule for a multi node in multiple (>=3) racks
-      # Ceph cluster must be in a healthy state.
-      crush_rule: replicated_rule
-      # NOTE(supamatt): By default use the first 8 characters of the hostname to
-      # define the the rack type bucket names for CRUSH.
-      rack_regex: "1-8"
+      osd_crush_update_on_start: false
   storage:
+    # NOTE(supamatt): By default, host based buckets are used for failure domains. Any `failure_domain`
+    # defined must match the failure domain used by the CRUSH rules for your pools. For example, with a
+    # crush rule of rack_replicated_rule you would specify "rack" as the `failure_domain`.
+    # `failure_domain`: Set the CRUSH bucket type for your OSDs to reside in. Supported CRUSH configuration
+    #   is listed here: http://docs.ceph.com/docs/luminous/rados/operations/crush-map/
+    # `failure_domain_by_hostname`: Specify the portion of the hostname to use as the failure domain bucket name.
+    # `failure_domain_name`: Manually set the failure domain bucket name. This option should only be used
+    #   with host based overrides.
+    # failure_domain: "rack"
+    # failure_domain_by_hostname: 1-8
+    # failure_domain_name: false
+
     # NOTE(portdirect): for homogeneous clusters the `osd` key can be used to
     # define OSD pods that will be deployed across the cluster.
     osd:
@@ -149,6 +150,7 @@ conf:
 #  - name: host1.fqdn
 #    conf:
 #      storage:
+#        failure_domain_name: "rack1"
 #        osd:
 #          - data:
 #              type: directory
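Once the OSDs have registered with these settings, the resulting layout can be sanity-checked from any node with admin credentials (the cluster name "ceph" and the rbd pool name are assumptions based on the chart defaults):

    # Confirm the failure-domain buckets exist under root=default
    ceph --cluster ceph osd tree
    # Confirm the rules created by the pool-init job and the rule a given pool uses
    ceph --cluster ceph osd crush rule ls
    ceph --cluster ceph osd pool get rbd crush_rule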