Add failure domains, and device classes for custom CRUSH rules
Largely inspired and taken from Kranthi's PS. - Add support for creating custom CRUSH rules based off of failure domains and device classes (ssd & hdd) - Basic logic around the PG calculator to autodetect the number of OSDs globally and per device class (required when using custom crush rules that specify device classes). Change-Id: I13a6f5eb21494746c2b77e340e8d0dcb0d81a591
This commit is contained in:
parent
8e369d2c9c
commit
6e8c289c13
@ -20,8 +20,6 @@ set -ex
|
||||
export LC_ALL=C
|
||||
|
||||
: "${ADMIN_KEYRING:=/etc/ceph/${CLUSTER}.client.admin.keyring}"
|
||||
: "${OSD_TARGET_PGS:=100}"
|
||||
: "${QUANTITY_OSDS:=15}"
|
||||
|
||||
if [[ ! -e /etc/ceph/${CLUSTER}.conf ]]; then
|
||||
echo "ERROR- /etc/ceph/${CLUSTER}.conf must exist; get it from your existing mon"
|
||||
@ -33,13 +31,21 @@ if [[ ! -e ${ADMIN_KEYRING} ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^same_host$"; then
|
||||
ceph --cluster "${CLUSTER}" osd crush rule create-simple same_host default osd
|
||||
#######################################
# Create a CRUSH rule unless a rule with exactly that name already exists.
# Globals:
#   CLUSTER (read)
#   CRUSH_NAME, CRUSH_RULE, CRUSH_FAILURE_DOMAIN, CRUSH_DEVICE_CLASS
#   (written; deliberately left global, as the original function did)
# Arguments:
#   $1 - name of the rule
#   $2 - `ceph osd crush rule` subcommand used to create it
#        (the values file uses create-simple and create-replicated)
#   $3 - failure domain (CRUSH bucket type) the rule spreads replicas over
#   $4 - device class; optional, omitted from the command when empty
# Returns:
#   0; a failing creation command is ignored on purpose (best effort).
#######################################
function create_crushrule () {
  CRUSH_NAME=$1
  CRUSH_RULE=$2
  CRUSH_FAILURE_DOMAIN=$3
  CRUSH_DEVICE_CLASS=$4

  # Compare the whole line as a fixed string. The previous pattern escaped the
  # dollar sign ("^\$CRUSH_NAME$"), so grep searched for the literal text
  # "$CRUSH_NAME" and the rule was re-created on every run.
  if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -Fxq -- "${CRUSH_NAME}"; then
    local -a rule_args=("${CRUSH_RULE}" "${CRUSH_NAME}" default "${CRUSH_FAILURE_DOMAIN}")
    # Only pass a device class when one was given; an empty argument must not
    # reach the ceph CLI.
    if [[ -n "${CRUSH_DEVICE_CLASS}" ]]; then
      rule_args+=("${CRUSH_DEVICE_CLASS}")
    fi
    # Best effort: do not abort the pool-init job if rule creation fails.
    ceph --cluster "${CLUSTER}" osd crush rule "${rule_args[@]}" || true
  fi
}
|
||||
|
||||
if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^rack_replicated_rule$"; then
|
||||
ceph --cluster "${CLUSTER}" osd crush rule create-simple rack_replicated_rule default rack
|
||||
fi
|
||||
{{- range $crush_rule := .Values.conf.pool.crush_rules -}}
|
||||
{{- with $crush_rule }}
|
||||
create_crushrule {{ .name }} {{ .crush_rule }} {{ .failure_domain }} {{ .device_class }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
function reweight_osds () {
|
||||
for OSD_ID in $(ceph --cluster "${CLUSTER}" osd df | awk '$3 == "0" {print $1}'); do
|
||||
@ -105,28 +111,35 @@ function manage_pool () {
|
||||
POOL_APPLICATION=$1
|
||||
POOL_NAME=$2
|
||||
POOL_REPLICATION=$3
|
||||
TOTAL_OSDS=$4
|
||||
TOTAL_DATA_PERCENT=$5
|
||||
TARGET_PG_PER_OSD=$6
|
||||
POOL_CRUSH_RULE=$7
|
||||
POOL_PROTECTION=$8
|
||||
TOTAL_DATA_PERCENT=$4
|
||||
TARGET_PG_PER_OSD=$5
|
||||
POOL_CRUSH_RULE=$6
|
||||
POOL_PROTECTION=$7
|
||||
TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd ls | wc -l)
|
||||
if (ceph --cluster "${CLUSTER}" osd crush rule dump "${POOL_CRUSH_RULE}" | awk '/item_name/' | grep -q ssd); then
|
||||
TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd tree | grep "ssd" | wc -l)
|
||||
elif (ceph --cluster "${CLUSTER}" osd crush rule dump "${POOL_CRUSH_RULE}" | awk '/item_name/' | grep -q hdd); then
|
||||
TOTAL_OSDS=$(ceph --cluster "${CLUSTER}" osd tree | grep "hdd" | wc -l)
|
||||
fi
|
||||
POOL_PLACEMENT_GROUPS=$(/tmp/pool-calc.py ${POOL_REPLICATION} ${TOTAL_OSDS} ${TOTAL_DATA_PERCENT} ${TARGET_PG_PER_OSD})
|
||||
create_pool "${POOL_APPLICATION}" "${POOL_NAME}" "${POOL_REPLICATION}" "${POOL_PLACEMENT_GROUPS}" "${POOL_CRUSH_RULE}" "${POOL_PROTECTION}"
|
||||
}
|
||||
|
||||
reweight_osds
|
||||
|
||||
{{ $targetNumOSD := .Values.conf.pool.target.osd }}
|
||||
{{ $targetPGperOSD := .Values.conf.pool.target.pg_per_osd }}
|
||||
{{ $crushRuleDefault := .Values.conf.pool.default.crush_rule }}
|
||||
{{ $targetProtection := .Values.conf.pool.target.protected | default "false" | quote | lower }}
|
||||
{{- range $pool := .Values.conf.pool.spec -}}
|
||||
{{- with $pool }}
|
||||
manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ $targetNumOSD }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ $crushRuleDefault }} {{ $targetProtection }}
|
||||
{{- if .crush_rule }}
|
||||
manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ .crush_rule }} {{ $targetProtection }}
|
||||
{{ else }}
|
||||
manage_pool {{ .application }} {{ .name }} {{ .replication }} {{ .percent_total_data }} {{ $targetPGperOSD }} {{ $crushRuleDefault }} {{ $targetProtection }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{- if .Values.conf.pool.crush.tunables }}
|
||||
ceph --cluster "${CLUSTER}" osd crush tunables {{ .Values.conf.pool.crush.tunables }}
|
||||
{{- end }}
|
||||
|
||||
|
@ -123,17 +123,48 @@ conf:
|
||||
tunables: null
|
||||
target:
|
||||
#NOTE(portdirect): arbitrarily we set the default number of expected OSD's to 5
|
||||
# to match the number of nodes in the OSH gate.
|
||||
# to match the number of nodes in the OSH gate (used only for helm tests).
|
||||
osd: 5
|
||||
pg_per_osd: 100
|
||||
protected: true
|
||||
default:
|
||||
# NOTE(supamatt): Accepted values are:
|
||||
# same_host for a single node
|
||||
# replicated_rule for a multi node
|
||||
# rack_replicated_rule for a multi node in multiple (>=3) racks
|
||||
# Ceph cluster must be in a healthy state.
|
||||
# NOTE(supamatt): Accepted values are taken from `crush_rules` list.
|
||||
crush_rule: replicated_rule
|
||||
crush_rules:
|
||||
# NOTE(supamatt): Device classes must remain undefined if all OSDs are the
|
||||
# same device type of backing disks (i.e., all HDD or all SSD).
|
||||
- name: same_host
|
||||
crush_rule: create-simple
|
||||
failure_domain: osd
|
||||
device_class:
|
||||
- name: replicated_rule
|
||||
crush_rule: create-simple
|
||||
failure_domain: host
|
||||
device_class:
|
||||
- name: rack_replicated_rule
|
||||
crush_rule: create-simple
|
||||
failure_domain: rack
|
||||
device_class:
|
||||
# - name: replicated_rule-ssd
|
||||
# crush_rule: create-replicated
|
||||
# failure_domain: host
|
||||
# device_class: ssd
|
||||
# - name: replicated_rule-hdd
|
||||
# crush_rule: create-replicated
|
||||
# failure_domain: host
|
||||
# device_class: hdd
|
||||
# - name: rack_replicated_rule-ssd
|
||||
# crush_rule: create-replicated
|
||||
# failure_domain: rack
|
||||
# device_class: ssd
|
||||
# - name: rack_replicated_rule-hdd
|
||||
# crush_rule: create-replicated
|
||||
# failure_domain: rack
|
||||
# device_class: hdd
|
||||
# - name: row_replicated_rule
|
||||
# crush_rule: create-simple
|
||||
# failure_domain: row
|
||||
# device_class:
|
||||
|
||||
# NOTE(portdirect): this section describes the pools that will be managed by
|
||||
# the ceph pool management job, as it tunes the pgs and crush rule, based on
|
||||
@ -144,6 +175,10 @@ conf:
|
||||
application: rbd
|
||||
replication: 3
|
||||
percent_total_data: 40
|
||||
# NOTE(supamatt): By default the crush rules used to create each pool will be
|
||||
# taken from the pool default `crush_rule` unless a pool specific `crush_rule`
|
||||
# is specified. The rule MUST exist for it to be defined here.
|
||||
# crush_rule: replicated_rule
|
||||
# CephFS pools
|
||||
- name: cephfs_metadata
|
||||
application: cephfs
|
||||
@ -214,6 +249,7 @@ conf:
|
||||
application: rgw
|
||||
replication: 3
|
||||
percent_total_data: 34.8
|
||||
|
||||
ceph:
|
||||
global:
|
||||
# auth
|
||||
|
@ -126,27 +126,40 @@ OSD_PATH="${OSD_PATH_BASE}-${OSD_ID}"
|
||||
OSD_KEYRING="${OSD_PATH}/keyring"
|
||||
# NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
|
||||
OSD_WEIGHT=0
|
||||
if [ "x${CRUSH_RULE}" == "xrack_replicated_rule" ]; then
|
||||
RACK_LOCATION=$(echo rack_$(echo ${HOSTNAME} | cut -c ${RACK_REGEX}))
|
||||
CRUSH_LOCATION=$(echo "root=default rack=${RACK_LOCATION} host=${HOSTNAME}")
|
||||
# Register this OSD in the CRUSH map at the given location (best effort).
# Globals:   CLUSTER, OSD_ID, OSD_KEYRING, OSD_WEIGHT (all read)
# Arguments: $1 - CRUSH location as space-separated key=value pairs
# Returns:   always 0; a failing ceph call is ignored
crush_create_or_move() {
  local location_pairs="$1"
  local -a ceph_auth=(--cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}")

  # ${location_pairs} is intentionally unquoted: every key=value pair has to
  # reach ceph as a separate argument.
  # shellcheck disable=SC2086
  ceph "${ceph_auth[@]}" osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${location_pairs} \
    || return 0
}
|
||||
#######################################
# Place this host (and its OSD) under a named failure-domain bucket.
# First asks ceph to create-or-move the OSD to
# root=default/<type>=<name>/host=$HOSTNAME, then verifies the result and,
# if the OSD did not end up under the requested bucket, creates the bucket
# and moves it and the host bucket explicitly.
# Globals:
#   CLUSTER, OSD_ID, OSD_KEYRING, HOSTNAME (read)
# Arguments:
#   $1 - failure domain bucket type (for example rack)
#   $2 - failure domain bucket name
# Returns:
#   0; every ceph mutation is best effort and its failure is ignored.
#######################################
function crush_add_and_move {
  local crush_failure_domain_type=${1}
  local crush_failure_domain_name=${2}
  local crush_location="root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}"
  local -a ceph_cmd=(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}")

  crush_create_or_move "${crush_location}"

  # Read back where the OSD actually is. Match the JSON key exactly instead of
  # grepping for the type anywhere on the line: a plain grep also hits values
  # that merely contain the type string (e.g. a hostname containing "rack"),
  # which made the comparison below fail and forced needless bucket moves.
  # NOTE(review): relies on `osd find` printing one "key": "value" pair per
  # line, as the original parsing did - confirm for the deployed ceph release.
  local crush_failure_domain_location_check
  crush_failure_domain_location_check=$("${ceph_cmd[@]}" osd find "${OSD_ID}" \
    | awk -F '"' -v bucket_type="${crush_failure_domain_type}" '$2 == bucket_type {print $4}') || true

  if [[ "${crush_failure_domain_location_check}" != "${crush_failure_domain_name}" ]]; then
    # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
    # as create-or-move may not appropriately move them.
    "${ceph_cmd[@]}" osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true
    "${ceph_cmd[@]}" osd crush move "${crush_failure_domain_name}" root=default || true
    "${ceph_cmd[@]}" osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true
  fi
}
|
||||
# Choose how to place this OSD in the CRUSH map:
#   - failure domain type is "host": plain create-or-move at ${CRUSH_LOCATION};
#   - an explicit failure domain name is set: use it as the bucket name;
#   - otherwise, if a hostname character range is set: bucket name is
#     "<type>_<selected hostname characters>";
#   - otherwise: plain create-or-move at ${CRUSH_LOCATION}.
# The previous check compared "x${CRUSH_FAILURE_DOMAIN_TYPE}" against "host"
# (no "x" prefix), which is always unequal, so the host case was never taken.
# Unset variables fall back to "host" / "false", the values the chart renders
# when nothing is configured.
if [[ "${CRUSH_FAILURE_DOMAIN_TYPE:-host}" != "host" ]]; then
  if [[ "${CRUSH_FAILURE_DOMAIN_NAME:-false}" != "false" ]]; then
    crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}"
  elif [[ "${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME:-false}" != "false" ]]; then
    # CRUSH_FAILURE_DOMAIN_BY_HOSTNAME is a `cut -c` character list such as 1-8.
    crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" \
      "${CRUSH_FAILURE_DOMAIN_TYPE}_$(printf '%s\n' "${HOSTNAME}" | cut -c "${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}")"
  else
    # NOTE(supamatt): if neither variable is defined, fall back to the expected default behavior
    crush_create_or_move "${CRUSH_LOCATION}"
  fi
else
  crush_create_or_move "${CRUSH_LOCATION}"
fi
|
||||
|
||||
if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
|
||||
if [ -n "${OSD_JOURNAL}" ]; then
|
||||
if [ -b "${OSD_JOURNAL}" ]; then
|
||||
|
@ -73,7 +73,40 @@ if [[ -n "$(find /var/lib/ceph/osd -prune -empty)" ]]; then
|
||||
# add the osd to the crush map
|
||||
# NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
|
||||
OSD_WEIGHT=0
|
||||
ceph --name=osd.${OSD_ID} --keyring=${OSD_KEYRING} osd crush create-or-move -- ${OSD_ID} ${OSD_WEIGHT} ${CRUSH_LOCATION}
|
||||
# Register this OSD in the CRUSH map at the given location (best effort).
# Globals:   CLUSTER, OSD_ID, OSD_KEYRING, OSD_WEIGHT (all read)
# Arguments: $1 - CRUSH location as space-separated key=value pairs
# Returns:   always 0; a failing ceph call is ignored
crush_create_or_move() {
  local location_pairs="$1"
  local -a ceph_auth=(--cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}")

  # ${location_pairs} is intentionally unquoted: every key=value pair has to
  # reach ceph as a separate argument.
  # shellcheck disable=SC2086
  ceph "${ceph_auth[@]}" osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${location_pairs} \
    || return 0
}
|
||||
#######################################
# Place this host (and its OSD) under a named failure-domain bucket.
# First asks ceph to create-or-move the OSD to
# root=default/<type>=<name>/host=$HOSTNAME, then verifies the result and,
# if the OSD did not end up under the requested bucket, creates the bucket
# and moves it and the host bucket explicitly.
# Globals:
#   CLUSTER, OSD_ID, OSD_KEYRING, HOSTNAME (read)
# Arguments:
#   $1 - failure domain bucket type (for example rack)
#   $2 - failure domain bucket name
# Returns:
#   0; every ceph mutation is best effort and its failure is ignored.
#######################################
function crush_add_and_move {
  local crush_failure_domain_type=${1}
  local crush_failure_domain_name=${2}
  local crush_location="root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}"
  local -a ceph_cmd=(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}")

  crush_create_or_move "${crush_location}"

  # Read back where the OSD actually is. Match the JSON key exactly instead of
  # grepping for the type anywhere on the line: a plain grep also hits values
  # that merely contain the type string (e.g. a hostname containing "rack"),
  # which made the comparison below fail and forced needless bucket moves.
  # NOTE(review): relies on `osd find` printing one "key": "value" pair per
  # line, as the original parsing did - confirm for the deployed ceph release.
  local crush_failure_domain_location_check
  crush_failure_domain_location_check=$("${ceph_cmd[@]}" osd find "${OSD_ID}" \
    | awk -F '"' -v bucket_type="${crush_failure_domain_type}" '$2 == bucket_type {print $4}') || true

  if [[ "${crush_failure_domain_location_check}" != "${crush_failure_domain_name}" ]]; then
    # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
    # as create-or-move may not appropriately move them.
    "${ceph_cmd[@]}" osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true
    "${ceph_cmd[@]}" osd crush move "${crush_failure_domain_name}" root=default || true
    "${ceph_cmd[@]}" osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true
  fi
}
|
||||
# Choose how to place this OSD in the CRUSH map:
#   - failure domain type is "host": plain create-or-move at ${CRUSH_LOCATION};
#   - an explicit failure domain name is set: use it as the bucket name;
#   - otherwise, if a hostname character range is set: bucket name is
#     "<type>_<selected hostname characters>";
#   - otherwise: plain create-or-move at ${CRUSH_LOCATION}.
# The previous check compared "x${CRUSH_FAILURE_DOMAIN_TYPE}" against "host"
# (no "x" prefix), which is always unequal, so the host case was never taken.
# Unset variables fall back to "host" / "false", the values the chart renders
# when nothing is configured.
if [[ "${CRUSH_FAILURE_DOMAIN_TYPE:-host}" != "host" ]]; then
  if [[ "${CRUSH_FAILURE_DOMAIN_NAME:-false}" != "false" ]]; then
    crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}"
  elif [[ "${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME:-false}" != "false" ]]; then
    # CRUSH_FAILURE_DOMAIN_BY_HOSTNAME is a `cut -c` character list such as 1-8.
    crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" \
      "${CRUSH_FAILURE_DOMAIN_TYPE}_$(printf '%s\n' "${HOSTNAME}" | cut -c "${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}")"
  else
    # NOTE(supamatt): if neither variable is defined, fall back to the default behavior
    crush_create_or_move "${CRUSH_LOCATION}"
  fi
else
  crush_create_or_move "${CRUSH_LOCATION}"
fi
|
||||
fi
|
||||
|
||||
# create the directory and an empty Procfile
|
||||
|
@ -179,10 +179,12 @@ spec:
|
||||
value: "ceph"
|
||||
- name: CEPH_GET_ADMIN_KEY
|
||||
value: "1"
|
||||
- name: CRUSH_RULE
|
||||
value: {{ .Values.conf.pool.default.crush_rule }}
|
||||
- name: RACK_REGEX
|
||||
value: {{ .Values.conf.pool.default.rack_regex }}
|
||||
- name: CRUSH_FAILURE_DOMAIN_TYPE
|
||||
value: {{ .Values.conf.storage.failure_domain | default "host" | quote }}
|
||||
- name: CRUSH_FAILURE_DOMAIN_NAME
|
||||
value: {{ .Values.conf.storage.failure_domain_name | default "false" | quote }}
|
||||
- name: CRUSH_FAILURE_DOMAIN_BY_HOSTNAME
|
||||
value: {{ .Values.conf.storage.failure_domain_by_hostname | default "false" | quote }}
|
||||
command:
|
||||
- /tmp/osd-start.sh
|
||||
lifecycle:
|
||||
|
@ -106,20 +106,21 @@ conf:
|
||||
osd_recovery_max_active: 1
|
||||
osd_mount_options_xfs: "rw,noatime,largeio,inode64,swalloc,logbufs=8,logbsize=256k,allocsize=4M"
|
||||
osd_journal_size: 10240
|
||||
|
||||
pool:
|
||||
default:
|
||||
# NOTE(supamatt): Accepted values are:
|
||||
# same_host for a single node
|
||||
# replicated_rule for a multi node
|
||||
# rack_replicated_rule for a multi node in multiple (>=3) racks
|
||||
# Ceph cluster must be in a healthy state.
|
||||
crush_rule: replicated_rule
|
||||
# NOTE(supamatt): By default use the first 8 characters of the hostname to
|
||||
# define the rack type bucket names for CRUSH.
|
||||
rack_regex: "1-8"
|
||||
osd_crush_update_on_start: false
|
||||
|
||||
storage:
|
||||
# NOTE(supamatt): By default use host based buckets for failure domains. Any `failure_domain` defined must
|
||||
# match the failure domain used on your CRUSH rules for pools. For example with a crush rule of
|
||||
# rack_replicated_rule you would specify "rack" as the `failure_domain` to use.
|
||||
# `failure_domain`: Set the CRUSH bucket type for your OSD to reside in. See the supported CRUSH configuration
|
||||
# as listed here: http://docs.ceph.com/docs/luminous/rados/operations/crush-map/
|
||||
# `failure_domain_by_hostname`: Specify the portion of the hostname to use for your failure domain bucket name.
|
||||
# `failure_domain_name`: Manually name the failure domain bucket name. This configuration option should only be used
|
||||
# when using host based overrides.
|
||||
# failure_domain: "rack"
|
||||
# failure_domain_by_hostname: 1-8
|
||||
# failure_domain_name: false
|
||||
|
||||
# NOTE(portdirect): for homogeneous clusters the `osd` key can be used to
|
||||
# define OSD pods that will be deployed across the cluster.
|
||||
osd:
|
||||
@ -149,6 +150,7 @@ conf:
|
||||
# - name: host1.fqdn
|
||||
# conf:
|
||||
# storage:
|
||||
# failure_domain_name: "rack1"
|
||||
# osd:
|
||||
# - data:
|
||||
# type: directory
|
||||
|
Loading…
Reference in New Issue
Block a user