Add failure domains, and device classes for custom CRUSH rules

Largely inspired and taken from Kranthi's PS.

- Add support for creating custom CRUSH rules based on failure
  domains and device classes (ssd & hdd).
- Add basic logic around the PG calculator to autodetect the number of
  OSDs globally and per device class (required when using custom CRUSH
  rules that specify device classes).

Change-Id: I13a6f5eb21494746c2b77e340e8d0dcb0d81a591
This commit is contained in:
Matthew Heler 2018-11-23 14:59:37 -06:00
parent 8e369d2c9c
commit 6e8c289c13
6 changed files with 151 additions and 52 deletions

View File

@ -20,8 +20,6 @@ set -ex
export LC_ALL=C
: "${ADMIN_KEYRING:=/etc/ceph/${CLUSTER}.client.admin.keyring}"
: "${OSD_TARGET_PGS:=100}"
: "${QUANTITY_OSDS:=15}"
if [[ ! -e /etc/ceph/${CLUSTER}.conf ]]; then
echo "ERROR- /etc/ceph/${CLUSTER}.conf must exist; get it from your existing mon"
@ -33,13 +31,21 @@ if [[ ! -e ${ADMIN_KEYRING} ]]; then
exit 1
fi
if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^same_host$"; then
ceph --cluster "${CLUSTER}" osd crush rule create-simple same_host default osd
# Create a CRUSH rule under the "default" root unless a rule with the same
# name already exists.
#   $1 - rule name
#   $2 - `ceph osd crush rule` sub-command used to create the rule
#        (for example create-simple or create-replicated)
#   $3 - failure domain (CRUSH bucket type) handed to the sub-command
#   $4 - optional device class; may be empty
function create_crushrule () {
  CRUSH_NAME=$1
  CRUSH_RULE=$2
  CRUSH_FAILURE_DOMAIN=$3
  CRUSH_DEVICE_CLASS=$4
  # Compare the rule name as a whole, literal line (-x -F). The previous
  # pattern escaped the dollar sign, so the variable was never expanded and
  # the existence check could not match any rule.
  if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -qxF -- "${CRUSH_NAME}"; then
    # The arguments are deliberately unquoted so that an empty device class
    # adds no argument. Creation is best effort: a failure does not abort.
    ceph --cluster "${CLUSTER}" osd crush rule $CRUSH_RULE $CRUSH_NAME default $CRUSH_FAILURE_DOMAIN $CRUSH_DEVICE_CLASS || true
  fi
}
if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^rack_replicated_rule$"; then
ceph --cluster "${CLUSTER}" osd crush rule create-simple rack_replicated_rule default rack
fi
# Emit one create_crushrule call per entry of conf.pool.crush_rules, passing
# the entry's name, crush_rule, failure_domain and device_class in that order.
{{- range $crush_rule := .Values.conf.pool.crush_rules -}}
{{- with $crush_rule }}
create_crushrule {{ .name }} {{ .crush_rule }} {{ .failure_domain }} {{ .device_class }}
{{- end }}
{{- end }}
function reweight_osds () {
for OSD_ID in $(ceph --cluster "${CLUSTER}" osd df | awk '$3 == "0" {print $1}'); do
@ -105,28 +111,35 @@ function manage_pool () {
POOL_APPLICATION=$1
POOL_NAME=$2
POOL_REPLICATION=$3
TOTAL_OSDS=$4
TOTAL_DATA_PERCENT=$5
TARGET_PG_PER_OSD=$6
POOL_CRUSH_RULE=$7
POOL_PROTECTION=$8
TOTAL_DATA_PERCENT=$4
TARGET_PG_PER_OSD=$5
POOL_CRUSH_RULE=$6
POOL_PROTECTION=$7
TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd ls | wc -l)
if (ceph --cluster "${CLUSTER}" osd crush rule dump "${POOL_CRUSH_RULE}" | awk '/item_name/' | grep -q ssd); then
TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd tree | grep "ssd" | wc -l)
elif (ceph --cluster "${CLUSTER}" osd crush rule dump "${POOL_CRUSH_RULE}" | awk '/item_name/' | grep -q hdd); then
TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd tree | grep "hdd" | wc -l)
fi
POOL_PLACEMENT_GROUPS=$(/tmp/pool-calc.py ${POOL_REPLICATION} ${TOTAL_OSDS} ${TOTAL_DATA_PERCENT} ${TARGET_PG_PER_OSD})
create_pool "${POOL_APPLICATION}" "${POOL_NAME}" "${POOL_REPLICATION}" "${POOL_PLACEMENT_GROUPS}" "${POOL_CRUSH_RULE}" "${POOL_PROTECTION}"
}
reweight_osds
{{ $targetNumOSD := .Values.conf.pool.target.osd }}
{{ $targetPGperOSD := .Values.conf.pool.target.pg_per_osd }}
{{ $crushRuleDefault := .Values.conf.pool.default.crush_rule }}
{{ $targetProtection := .Values.conf.pool.target.protected | default "false" | quote | lower }}
{{- range $pool := .Values.conf.pool.spec -}}
{{- with $pool }}
manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ $targetNumOSD }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ $crushRuleDefault }} {{ $targetProtection }}
{{- if .crush_rule }}
manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ .crush_rule }} {{ $targetProtection }}
{{ else }}
manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ $crushRuleDefault }} {{ $targetProtection }}
{{- end }}
{{- end }}
{{- end }}
{{- if .Values.conf.pool.crush.tunables }}
ceph --cluster "${CLUSTER}" osd crush tunables {{ .Values.conf.pool.crush.tunables }}
{{- end }}

View File

@ -123,17 +123,48 @@ conf:
tunables: null
target:
#NOTE(portdirect): arbitrarily we set the default number of expected OSD's to 5
# to match the number of nodes in the OSH gate.
# to match the number of nodes in the OSH gate (used only for helm tests).
osd: 5
pg_per_osd: 100
protected: true
default:
# NOTE(supamatt): Accepted values are:
# same_host for a single node
# replicated_rule for a multi node
# rack_replicated_rule for a multi node in multiple (>=3) racks
# Ceph cluster must be in a healthy state.
# NOTE(supamatt): Accepted values are taken from `crush_rules` list.
crush_rule: replicated_rule
crush_rules:
# NOTE(supamatt): Device classes must remain undefined if all OSDs use the
# same type of backing disk (i.e., all HDD or all SSD).
- name: same_host
crush_rule: create-simple
failure_domain: osd
device_class:
- name: replicated_rule
crush_rule: create-simple
failure_domain: host
device_class:
- name: rack_replicated_rule
crush_rule: create-simple
failure_domain: rack
device_class:
# - name: replicated_rule-ssd
# crush_rule: create-replicated
# failure_domain: host
# device_class: ssd
# - name: replicated_rule-hdd
# crush_rule: create-replicated
# failure_domain: host
# device_class: hdd
# - name: rack_replicated_rule-ssd
# crush_rule: create-replicated
# failure_domain: rack
# device_class: ssd
# - name: rack_replicated_rule-hdd
# crush_rule: create-replicated
# failure_domain: rack
# device_class: hdd
# - name: row_replicated_rule
# crush_rule: create-simple
# failure_domain: row
# device_class:
# NOTE(portdirect): this section describes the pools that will be managed by
# the ceph pool management job, as it tunes the pgs and crush rule, based on
@ -144,6 +175,10 @@ conf:
application: rbd
replication: 3
percent_total_data: 40
# NOTE(supamatt): By default the crush rules used to create each pool will be
# taken from the pool default `crush_rule` unless a pool specific `crush_rule`
# is specified. The rule MUST exist for it to be defined here.
# crush_rule: replicated_rule
# CephFS pools
- name: cephfs_metadata
application: cephfs
@ -214,6 +249,7 @@ conf:
application: rgw
replication: 3
percent_total_data: 34.8
ceph:
global:
# auth

View File

@ -126,27 +126,40 @@ OSD_PATH="${OSD_PATH_BASE}-${OSD_ID}"
OSD_KEYRING="${OSD_PATH}/keyring"
# NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
OSD_WEIGHT=0
if [ "x${CRUSH_RULE}" == "xrack_replicated_rule" ]; then
RACK_LOCATION=$(echo rack_$(echo ${HOSTNAME} | cut -c ${RACK_REGEX}))
CRUSH_LOCATION=$(echo "root=default rack=${RACK_LOCATION} host=${HOSTNAME}")
function crush_create_or_move {
local crush_location=${1}
ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true
RACK_LOCATION_CHECK=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | awk -F'"' '/rack/{print $4}')
if [ "x${RACK_LOCATION_CHECK}" != x${RACK_LOCATION} ]; then
osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${crush_location} || true
}
function crush_add_and_move {
local crush_failure_domain_type=${1}
local crush_failure_domain_name=${2}
local crush_location=$(echo "root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}")
crush_create_or_move "${crush_location}"
local crush_failure_domain_location_check=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | grep "${crush_failure_domain_type}" | awk -F '"' '{print $4}')
if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ]; then
# NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
# as create-or-move may not appropriately move them.
ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
osd crush add-bucket ${RACK_LOCATION} rack || true
osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true
ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
osd crush move ${RACK_LOCATION} root=default || true
osd crush move "${crush_failure_domain_name}" root=default || true
ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
osd crush move ${HOSTNAME} rack=${RACK_LOCATION} || true
osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true
fi
}
# Decide where this OSD goes in the CRUSH map. Both sides of each comparison
# carry the "x" prefix; the outer test previously compared against a bare
# "host", which could never be equal and made the else branch unreachable.
if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "xhost" ]; then
  if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then
    # An explicit failure domain bucket name was supplied.
    crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}"
  elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then
    # Build the bucket name as <type>_<hostname characters selected by cut -c>.
    crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))"
  else
    # NOTE(supamatt): if neither variable is defined we fall back to the expected default behavior
    crush_create_or_move "${CRUSH_LOCATION}"
  fi
else
  # NOTE(review): the inline create-or-move below issues the same command as
  # the crush_create_or_move call that follows it; it looks like a leftover
  # from before the helper existed - confirm before removing.
  ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true
  crush_create_or_move "${CRUSH_LOCATION}"
fi
if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
if [ -n "${OSD_JOURNAL}" ]; then
if [ -b "${OSD_JOURNAL}" ]; then

View File

@ -73,7 +73,40 @@ if [[ -n "$(find /var/lib/ceph/osd -prune -empty)" ]]; then
# add the osd to the crush map
# NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
OSD_WEIGHT=0
ceph --name=osd.${OSD_ID} --keyring=${OSD_KEYRING} osd crush create-or-move -- ${OSD_ID} ${OSD_WEIGHT} ${CRUSH_LOCATION}
# Register this OSD in the CRUSH map at the given location, or move it there.
#   $1 - CRUSH location as space separated key=value pairs
# Best effort: a failing ceph call never aborts the caller.
function crush_create_or_move {
  local target_location="$1"
  # target_location is expanded unquoted so each key=value pair becomes its
  # own argument.
  ceph \
    --cluster "${CLUSTER}" \
    --name="osd.${OSD_ID}" \
    --keyring="${OSD_KEYRING}" \
    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${target_location} \
    || true
}
# Place this OSD, and if necessary its host bucket, under the named failure
# domain bucket in the CRUSH map.
#   $1 - failure domain (CRUSH bucket) type, for example rack
#   $2 - name of the failure domain bucket
# Every ceph call is best effort and never aborts the caller.
function crush_add_and_move {
  local crush_failure_domain_type=${1}
  local crush_failure_domain_name=${2}
  local crush_location="root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}"
  crush_create_or_move "${crush_location}"
  # Read back where the OSD ended up. Only the line whose quoted key is exactly
  # the failure domain type is selected; a substring match would also pick up
  # values that merely contain the type name (e.g. a host called "rack-node1")
  # and trigger the manual move below for no reason.
  local crush_failure_domain_location_check
  crush_failure_domain_location_check=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find "${OSD_ID}" | awk -F '"' -v key="${crush_failure_domain_type}" '$2 == key {print $4}') || true
  if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ]; then
    # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
    # as create-or-move may not appropriately move them.
    ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
      osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true
    ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
      osd crush move "${crush_failure_domain_name}" root=default || true
    ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
      osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true
  fi
}
# Decide where this OSD goes in the CRUSH map. Both sides of each comparison
# carry the "x" prefix; the outer test previously compared against a bare
# "host", which could never be equal and made the else branch unreachable.
if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "xhost" ]; then
  if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then
    # An explicit failure domain bucket name was supplied.
    crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}"
  elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then
    # Build the bucket name as <type>_<hostname characters selected by cut -c>.
    crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))"
  else
    # NOTE(supamatt): if neither variable is defined we fall back to default behavior
    crush_create_or_move "${CRUSH_LOCATION}"
  fi
else
  # Host-level failure domain: use the plain CRUSH location.
  crush_create_or_move "${CRUSH_LOCATION}"
fi
fi
# create the directory and an empty Procfile

View File

@ -179,10 +179,12 @@ spec:
value: "ceph"
- name: CEPH_GET_ADMIN_KEY
value: "1"
- name: CRUSH_RULE
value: {{ .Values.conf.pool.default.crush_rule }}
- name: RACK_REGEX
value: {{ .Values.conf.pool.default.rack_regex }}
- name: CRUSH_FAILURE_DOMAIN_TYPE
value: {{ .Values.conf.storage.failure_domain | default "host" | quote }}
- name: CRUSH_FAILURE_DOMAIN_NAME
value: {{ .Values.conf.storage.failure_domain_name | default "false" | quote }}
- name: CRUSH_FAILURE_DOMAIN_BY_HOSTNAME
value: {{ .Values.conf.storage.failure_domain_by_hostname | default "false" | quote }}
command:
- /tmp/osd-start.sh
lifecycle:

View File

@ -106,20 +106,21 @@ conf:
osd_recovery_max_active: 1
osd_mount_options_xfs: "rw,noatime,largeio,inode64,swalloc,logbufs=8,logbsize=256k,allocsize=4M"
osd_journal_size: 10240
pool:
default:
# NOTE(supamatt): Accepted values are:
# same_host for a single node
# replicated_rule for a multi node
# rack_replicated_rule for a multi node in multiple (>=3) racks
# Ceph cluster must be in a healthy state.
crush_rule: replicated_rule
# NOTE(supamatt): By default use the first 8 characters of the hostname to
# define the rack type bucket names for CRUSH.
rack_regex: "1-8"
osd_crush_update_on_start: false
storage:
# NOTE(supamatt): By default use host based buckets for failure domains. Any `failure_domain` defined must
# match the failure domain used on your CRUSH rules for pools. For example with a crush rule of
# rack_replicated_rule you would specify "rack" as the `failure_domain` to use.
# `failure_domain`: Set the CRUSH bucket type for your OSD to reside in. The supported CRUSH configuration
# is listed here: http://docs.ceph.com/docs/luminous/rados/operations/crush-map/
# `failure_domain_by_hostname`: Specify the portion of the hostname to use for your failure domain bucket name.
# `failure_domain_name`: Manually name the failure domain bucket name. This configuration option should only be used
# when using host based overrides.
# failure_domain: "rack"
# failure_domain_by_hostname: 1-8
# failure_domain_name: false
# NOTE(portdirect): for homogeneous clusters the `osd` key can be used to
# define OSD pods that will be deployed across the cluster.
osd:
@ -149,6 +150,7 @@ conf:
# - name: host1.fqdn
# conf:
# storage:
# failure_domain_name: "rack1"
# osd:
# - data:
# type: directory