Enable Ceph charts to be rack aware for CRUSH

Add support for a rack-level CRUSH map. Rack-level CRUSH support is
enabled by selecting the "rack_replicated_rule" CRUSH rule.

Change-Id: I4df224f2821872faa2eddec2120832e9a22f4a7c
Matthew Heler 2018-11-16 12:20:52 -06:00
parent 5d356f9265
commit 5ce9f2eb3b
5 changed files with 46 additions and 9 deletions

@@ -37,6 +37,10 @@ if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^same_host$"; then
ceph --cluster "${CLUSTER}" osd crush rule create-simple same_host default osd
fi
if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^rack_replicated_rule$"; then
ceph --cluster "${CLUSTER}" osd crush rule create-simple rack_replicated_rule default rack
fi
function reweight_osds () {
for OSD_ID in $(ceph --cluster "${CLUSTER}" osd df | awk '$3 == "0" {print $1}'); do
OSD_WEIGHT=$(ceph --cluster "${CLUSTER}" osd df --format json-pretty| grep -A7 "\bosd.${OSD_ID}\b" | awk '/"kb"/{ gsub(",",""); d= $2/1073741824 ; r = sprintf("%.2f", d); print r }');
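
For reviewers who want to confirm the new rule on a running cluster, the commands below are a hedged sketch only (run from any pod or node holding an admin keyring; the cluster name "ceph" is assumed) and are not part of this change:

# list the CRUSH rules; rack_replicated_rule should now appear alongside replicated_rule and same_host
ceph --cluster ceph osd crush rule ls
# dump the rule; create-simple with a "rack" failure domain produces a chooseleaf step of type rack
ceph --cluster ceph osd crush rule dump rack_replicated_rule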

@@ -128,9 +128,13 @@ conf:
pg_per_osd: 100
protected: true
default:
#NOTE(portdirect): this should be 'same_host' for a single node
# cluster to be in a healthy state
# NOTE(supamatt): Accepted values are:
#   same_host for a single-node cluster
#   replicated_rule for a multi-node cluster
#   rack_replicated_rule for a multi-node cluster spread across multiple (>=3) racks
# The Ceph cluster must be in a healthy state.
crush_rule: replicated_rule
#NOTE(portdirect): this section describes the pools that will be managed by
# the ceph pool management job, as it tunes the pgs and crush rule, based on
# the above.
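
As an illustration of how this value is meant to be consumed (the release name and chart path below are hypothetical, not part of this change), the rack-aware rule can be selected with a values override at deploy time:

# hypothetical override; the key mirrors conf.pool.default.crush_rule above
helm upgrade --install ceph-client ./ceph-client \
  --namespace ceph \
  --set conf.pool.default.crush_rule=rack_replicated_rule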

@@ -126,13 +126,26 @@ OSD_PATH="${OSD_PATH_BASE}-${OSD_ID}"
OSD_KEYRING="${OSD_PATH}/keyring"
# NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
OSD_WEIGHT=0
ceph \
--cluster "${CLUSTER}" \
--name="osd.${OSD_ID}" \
--keyring="${OSD_KEYRING}" \
osd \
crush \
create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION}
if [ "x${CRUSH_RULE}" == "xrack_replicated_rule" ]; then
RACK_LOCATION="rack_$(echo "${HOSTNAME}" | cut -c "${RACK_REGEX}")"
CRUSH_LOCATION="root=default rack=${RACK_LOCATION} host=${HOSTNAME}"
ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true
RACK_LOCATION_CHECK=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | awk -F'"' '/rack/{print $4}')
if [ "x${RACK_LOCATION_CHECK}" != x${RACK_LOCATION} ]; then
# NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
# as create-or-move may not appropiately move them.
ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
osd crush add-bucket ${RACK_LOCATION} rack || true
ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
osd crush move ${RACK_LOCATION} root=default || true
ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
osd crush move ${HOSTNAME} rack=${RACK_LOCATION} || true
fi
else
ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true
fi
if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
if [ -n "${OSD_JOURNAL}" ]; then

@@ -179,6 +179,10 @@ spec:
value: "ceph"
- name: CEPH_GET_ADMIN_KEY
value: "1"
- name: CRUSH_RULE
value: {{ .Values.conf.pool.default.crush_rule }}
- name: RACK_REGEX
value: {{ .Values.conf.pool.default.rack_regex }}
command:
- /tmp/osd-start.sh
lifecycle:
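
A quick hedged way to confirm the rendered values actually reach the OSD containers (the pod name and namespace below are placeholders, not part of this change):

kubectl -n ceph exec <ceph-osd-pod> -- env | grep -E '^(CRUSH_RULE|RACK_REGEX)='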

@@ -107,6 +107,18 @@ conf:
osd_mount_options_xfs: "rw,noatime,largeio,inode64,swalloc,logbufs=8,logbsize=256k,allocsize=4M"
osd_journal_size: 10240
pool:
default:
# NOTE(supamatt): Accepted values are:
#   same_host for a single-node cluster
#   replicated_rule for a multi-node cluster
#   rack_replicated_rule for a multi-node cluster spread across multiple (>=3) racks
# The Ceph cluster must be in a healthy state.
crush_rule: replicated_rule
# NOTE(supamatt): By default, use the first 8 characters of the hostname to
# define the rack-type bucket names for CRUSH.
rack_regex: "1-8"
storage:
# NOTE(portdirect): for homogeneous clusters the `osd` key can be used to
# define OSD pods that will be deployed across the cluster.
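
Once OSDs come up with rack_replicated_rule selected, rack buckets should sit between the default root and the host buckets. The spot-check below is a hedged sketch; the expected shape of the output is shown only as illustrative comments, with made-up names:

ceph --cluster ceph osd tree
# expected shape (names illustrative):
#   root default
#     rack rack_cab01-r0
#       host cab01-r02-node07
#         osd.0   up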