4841f53ca6
The PS improves performance by replacing lsblk/blkid (blkid can be quite slow in some cases). It also avoids deadlocks when RBDs are mapped on the host. Change-Id: If607e168515f55478e9e55e421738d2d00269d3f
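For context, a minimal sketch of the LVM-tag lookup that the helpers below use instead of probing devices with blkid; the device path /dev/sdb is hypothetical and only illustrates the pattern:

# Illustrative only: resolve the OSD ID from LVM tags rather than probing with blkid
lv="$(pvdisplay -m /dev/sdb | awk '/Logical volume/{print $3}')"
lvs -o lv_tags "${lv}" | tr ',' '\n' | grep ceph.osd_id | cut -d'=' -f2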
#!/bin/bash

{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}

set -ex
export PS4='+${BASH_SOURCE:+$(basename ${BASH_SOURCE}):${LINENO}:}${FUNCNAME:+${FUNCNAME}():} '

: "${CRUSH_LOCATION:=root=default host=${HOSTNAME}}"
: "${OSD_PATH_BASE:=/var/lib/ceph/osd/${CLUSTER}}"
: "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}"
: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}"
: "${OSD_JOURNAL_UUID:=$(uuidgen)}"
: "${OSD_JOURNAL_SIZE:=$(awk '/^osd_journal_size/{print $3}' ${CEPH_CONF}.template)}"
: "${OSD_WEIGHT:=1.0}"

eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))')
eval CRUSH_FAILURE_DOMAIN_BY_HOSTNAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_by_hostname"]))')
eval CRUSH_FAILURE_DOMAIN_FROM_HOSTNAME_MAP=$(cat /etc/ceph/storage.json | jq '.failure_domain_by_hostname_map."'$HOSTNAME'"')
eval DEVICE_CLASS=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["device_class"]))')

if [[ $(ceph -v | egrep -q "nautilus|mimic|luminous"; echo $?) -ne 0 ]]; then
  echo "ERROR- need Luminous/Mimic/Nautilus release"
  exit 1
fi

if [ -z "${HOSTNAME}" ]; then
  echo "HOSTNAME not set; this prevents adding the OSD to the CRUSH map"
  exit 1
fi

if [[ ! -e ${CEPH_CONF}.template ]]; then
  echo "ERROR- ${CEPH_CONF}.template must exist; get it from your existing mon"
  exit 1
else
  ENDPOINT=$(kubectl get endpoints ceph-mon-discovery -n ${NAMESPACE} -o json | awk -F'"' -v port=${MON_PORT} \
    -v version=v1 -v msgr_version=v2 \
    -v msgr2_port=${MON_PORT_V2} \
    '/"ip"/{print "["version":"$4":"port"/"0","msgr_version":"$4":"msgr2_port"/"0"]"}' | paste -sd',')
  if [[ "${ENDPOINT}" == "" ]]; then
    /bin/sh -c -e "cat ${CEPH_CONF}.template | tee ${CEPH_CONF}" || true
  else
    /bin/sh -c -e "cat ${CEPH_CONF}.template | sed 's#mon_host.*#mon_host = ${ENDPOINT}#g' | tee ${CEPH_CONF}" || true
  fi
fi

# Wait for a file to exist, regardless of the type
function wait_for_file {
  timeout 10 bash -c "while [ ! -e ${1} ]; do echo 'Waiting for ${1} to show up' && sleep 1 ; done"
}

# Check whether a command is available in this environment
function is_available {
  command -v $@ &>/dev/null
}

# Run a ceph command, retrying up to six times with a 10 second pause between attempts
function ceph_cmd_retry() {
  cnt=0
  until "ceph" "$@" || [ $cnt -ge 6 ]; do
    sleep 10
    ((cnt++))
  done
}

# Run a command while holding an exclusive flock on the shared init-osd lock file
function locked() {
  exec {lock_fd}>/var/lib/ceph/tmp/init-osd.lock || exit 1
  flock -w 600 --verbose "${lock_fd}"
  "$@"
  flock -u "${lock_fd}"
}

# Add or move this OSD to the given CRUSH location with the configured weight
function crush_create_or_move {
  local crush_location=${1}
  ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${crush_location}
}

# Place this OSD under the given failure-domain bucket, creating and moving the bucket if needed
function crush_add_and_move {
  local crush_failure_domain_type=${1}
  local crush_failure_domain_name=${2}
  local crush_location=$(echo "root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}")
  crush_create_or_move "${crush_location}"
  local crush_failure_domain_location_check=$(ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | grep "${crush_failure_domain_type}" | awk -F '"' '{print $4}')
  if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ]; then
    # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
    # as create-or-move may not appropriately move them.
    ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
      osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true
    ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
      osd crush move "${crush_failure_domain_name}" root=default || true
    ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
      osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true
  fi
}

# Set the device class and place the OSD according to the configured failure domain
function crush_location {
  set_device_class
  if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "xhost" ]; then
    if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then
      crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}"
    elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then
      crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))"
    elif [ "x${CRUSH_FAILURE_DOMAIN_FROM_HOSTNAME_MAP}" != "xnull" ]; then
      crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_FROM_HOSTNAME_MAP}"
    else
      # NOTE(supamatt): if neither variable is defined, fall back to the default behavior
      crush_create_or_move "${CRUSH_LOCATION}"
    fi
  else
    crush_create_or_move "${CRUSH_LOCATION}"
  fi
}

# Calculate proper device names, given a device and partition number
function dev_part {
  local osd_device=${1}
  local osd_partition=${2}

  if [[ -L ${osd_device} ]]; then
    # This device is a symlink. Work out its actual device
    local actual_device=$(readlink -f "${osd_device}")
    local bn=$(basename "${osd_device}")
    if [[ "${actual_device:0-1:1}" == [0-9] ]]; then
      local desired_partition="${actual_device}p${osd_partition}"
    else
      local desired_partition="${actual_device}${osd_partition}"
    fi
    # Now search for a symlink in the directory of $osd_device
    # that has the correct desired partition, and the longest
    # shared prefix with the original symlink
    local symdir=$(dirname "${osd_device}")
    local link=""
    local pfxlen=0
    for option in ${symdir}/*; do
      [[ -e $option ]] || break
      if [[ $(readlink -f "${option}") == "${desired_partition}" ]]; then
        local optprefixlen=$(prefix_length "${option}" "${bn}")
        if [[ ${optprefixlen} -gt ${pfxlen} ]]; then
          link=${symdir}/${option}
          pfxlen=${optprefixlen}
        fi
      fi
    done
    if [[ $pfxlen -eq 0 ]]; then
      >&2 echo "Could not locate appropriate symlink for partition ${osd_partition} of ${osd_device}"
      exit 1
    fi
    echo "$link"
  elif [[ "${osd_device:0-1:1}" == [0-9] ]]; then
    echo "${osd_device}p${osd_partition}"
  else
    echo "${osd_device}${osd_partition}"
  fi
}

function zap_extra_partitions {
  # Examine temp mount and delete any block.db and block.wal partitions
  mountpoint=${1}
  journal_disk=""
  journal_part=""
  block_db_disk=""
  block_db_part=""
  block_wal_disk=""
  block_wal_part=""

  # Discover journal, block.db, and block.wal partitions first before deleting anything
  # If the partitions are on the same disk, deleting one can affect discovery of the other(s)
  if [ -L "${mountpoint}/journal" ]; then
    journal_disk=$(readlink -m ${mountpoint}/journal | sed 's/[0-9]*//g')
    journal_part=$(readlink -m ${mountpoint}/journal | sed 's/[^0-9]*//g')
  fi
  if [ -L "${mountpoint}/block.db" ]; then
    block_db_disk=$(readlink -m ${mountpoint}/block.db | sed 's/[0-9]*//g')
    block_db_part=$(readlink -m ${mountpoint}/block.db | sed 's/[^0-9]*//g')
  fi
  if [ -L "${mountpoint}/block.wal" ]; then
    block_wal_disk=$(readlink -m ${mountpoint}/block.wal | sed 's/[0-9]*//g')
    block_wal_part=$(readlink -m ${mountpoint}/block.wal | sed 's/[^0-9]*//g')
  fi

  # Delete any discovered journal, block.db, and block.wal partitions
  if [ ! -z "${journal_disk}" ]; then
    sgdisk -d ${journal_part} ${journal_disk}
    /sbin/udevadm settle --timeout=600
    /usr/bin/flock -s ${journal_disk} /sbin/partprobe ${journal_disk}
    /sbin/udevadm settle --timeout=600
  fi
  if [ ! -z "${block_db_disk}" ]; then
    sgdisk -d ${block_db_part} ${block_db_disk}
    /sbin/udevadm settle --timeout=600
    /usr/bin/flock -s ${block_db_disk} /sbin/partprobe ${block_db_disk}
    /sbin/udevadm settle --timeout=600
  fi
  if [ ! -z "${block_wal_disk}" ]; then
    sgdisk -d ${block_wal_part} ${block_wal_disk}
    /sbin/udevadm settle --timeout=600
    /usr/bin/flock -s ${block_wal_disk} /sbin/partprobe ${block_wal_disk}
    /sbin/udevadm settle --timeout=600
  fi
}

function disk_zap {
  # Run all the commands that ceph-disk zap uses to clear a disk
  local device=${1}
  local device_filter=$(basename "${device}")
  local dm_devices=$(get_lvm_path_from_device "pv_name=~${device_filter},lv_name=~ceph")
  for dm_device in ${dm_devices}; do
    if [[ ! -z ${dm_device} ]]; then
      dmsetup remove ${dm_device}
    fi
  done
  local logical_volumes=$(locked lvdisplay | grep "LV Path" | grep "$device_filter" | awk '/ceph/{print $3}' | tr '\n' ' ')
  for logical_volume in ${logical_volumes}; do
    if [[ ! -z ${logical_volume} ]]; then
      locked lvremove -y ${logical_volume}
    fi
  done
  local volume_group=$(pvdisplay ${device} | grep "VG Name" | awk '/ceph/{print $3}' | grep "ceph")
  if [[ ${volume_group} ]]; then
    vgremove ${volume_group}
    pvremove ${device}
    ceph-volume lvm zap ${device} --destroy
  fi
  wipefs --all ${device}
  sgdisk --zap-all -- ${device}
  # Wipe the first 200MB boundary, as Bluestore redeployments will not work otherwise
  dd if=/dev/zero of=${device} bs=1M count=200
}

# Make sure the kernel and udev see current partition tables for the OSD, DB, WAL, and journal devices
function udev_settle {
  osd_devices="${OSD_DEVICE}"
  partprobe "${OSD_DEVICE}"
  if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then
    if [ ! -z "$BLOCK_DB" ]; then
      osd_devices="${osd_devices}\|${BLOCK_DB}"
      # BLOCK_DB could be a physical or logical device here
      local block_db="$BLOCK_DB"
      local db_vg="$(echo $block_db | cut -d'/' -f1)"
      if [ ! -z "$db_vg" ]; then
        block_db=$(locked pvdisplay | grep -B1 "$db_vg" | awk '/PV Name/{print $3}')
      fi
      locked partprobe "${block_db}"
    fi
    if [ ! -z "$BLOCK_WAL" ] && [ "$BLOCK_WAL" != "$BLOCK_DB" ]; then
      osd_devices="${osd_devices}\|${BLOCK_WAL}"
      # BLOCK_WAL could be a physical or logical device here
      local block_wal="$BLOCK_WAL"
      local wal_vg="$(echo $block_wal | cut -d'/' -f1)"
      if [ ! -z "$wal_vg" ]; then
        block_wal=$(locked pvdisplay | grep -B1 "$wal_vg" | awk '/PV Name/{print $3}')
      fi
      locked partprobe "${block_wal}"
    fi
  else
    if [ "x$JOURNAL_TYPE" == "xblock-logical" ] && [ ! -z "$OSD_JOURNAL" ]; then
      OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
      if [ ! -z "$OSD_JOURNAL" ]; then
        local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
        osd_devices="${osd_devices}\|${JDEV}"
        locked partprobe "${JDEV}"
      fi
    fi
  fi
  # Watch the udev event queue and exit once all current events are handled
  udevadm settle --timeout=600

  # On occasion udev may not create the correct device symlinks for Ceph, so we make them manually just in case
  mkdir -p /dev/disk/by-partuuid
  for dev in $(awk '!/rbd/{print $4}' /proc/partitions | grep "${osd_devices}" | grep "[0-9]"); do
    diskdev=$(echo "${dev//[!a-z]/}")
    partnum=$(echo "${dev//[!0-9]/}")
    symlink="/dev/disk/by-partuuid/$(sgdisk -i ${partnum} /dev/${diskdev} | awk '/Partition unique GUID/{print tolower($4)}')"
    if [ ! -e "${symlink}" ]; then
      ln -s "../../${dev}" "${symlink}"
    fi
  done

  # Give udev another chance now that all symlinks exist for devices we care about
  udevadm settle --timeout=600
}

# Helper function to get an lvm tag from a logical volume
function get_lvm_tag_from_volume {
  logical_volume="$1"
  tag="$2"

  if [[ "$#" -lt 2 ]] || [[ -z "${logical_volume}" ]]; then
    # Return an empty string if the logical volume doesn't exist
    echo
  else
    # Get and return the specified tag from the logical volume
    locked lvs -o lv_tags ${logical_volume} | tr ',' '\n' | grep ${tag} | cut -d'=' -f2
  fi
}

# Helper function to get an lvm tag from a physical device
function get_lvm_tag_from_device {
  device="$1"
  tag="$2"
  # Attempt to get a logical volume for the physical device
  logical_volume="$(locked pvdisplay -m ${device} | awk '/Logical volume/{print $3}')"

  # Use get_lvm_tag_from_volume to get the specified tag from the logical volume
  get_lvm_tag_from_volume ${logical_volume} ${tag}
}

# Helper function to get the size (in KiB) of the logical volume on a device
function get_lv_size_from_device {
  device="$1"
  logical_volume="$(get_lv_from_device ${device})"

  lvs ${logical_volume} -o LV_SIZE --noheadings --units k --nosuffix | xargs | cut -d'.' -f1
}

# Helper function to get the crush weight for an osd device
function get_osd_crush_weight_from_device {
  device="$1"
  lv_size="$(get_lv_size_from_device ${device})" # KiB

  if [[ ! -z "${BLOCK_DB_SIZE}" ]]; then
    db_size=$(echo "${BLOCK_DB_SIZE}" | cut -d'B' -f1 | numfmt --from=iec | awk '{print $1/1024}') # KiB
    lv_size=$((lv_size+db_size)) # KiB
  fi

  echo ${lv_size} | awk '{printf("%.2f\n", $1/1073741824)}' # KiB to TiB
}

# Helper function to get a cluster FSID from a physical device
function get_cluster_fsid_from_device {
  device="$1"

  # Use get_lvm_tag_from_device to get the cluster FSID from the device
  get_lvm_tag_from_device ${device} ceph.cluster_fsid
}

# Helper function to get an OSD ID from a logical volume
function get_osd_id_from_volume {
  logical_volume="$1"

  # Use get_lvm_tag_from_volume to get the OSD ID from the logical volume
  get_lvm_tag_from_volume ${logical_volume} ceph.osd_id
}

# Helper function to get an OSD ID from a physical device
function get_osd_id_from_device {
  device="$1"

  # Use get_lvm_tag_from_device to get the OSD ID from the device
  get_lvm_tag_from_device ${device} ceph.osd_id
}

# Helper function to get an OSD FSID from a physical device
function get_osd_fsid_from_device {
  device="$1"

  # Use get_lvm_tag_from_device to get the OSD FSID from the device
  get_lvm_tag_from_device ${device} ceph.osd_fsid
}

# Helper function to get an OSD DB device from a physical device
function get_osd_db_device_from_device {
  device="$1"

  # Use get_lvm_tag_from_device to get the OSD DB device from the device
  get_lvm_tag_from_device ${device} ceph.db_device
}

# Helper function to get an OSD WAL device from a physical device
function get_osd_wal_device_from_device {
  device="$1"

  # Use get_lvm_tag_from_device to get the OSD WAL device from the device
  get_lvm_tag_from_device ${device} ceph.wal_device
}

# Return the device-mapper paths of logical volumes matching the given pvs selection criteria
function get_lvm_path_from_device {
  select="$1"

  options="--noheadings -o lv_dm_path"
  pvs ${options} -S "${select}" | tr -d ' '
}

# Ensure the OSD is assigned the configured CRUSH device class
function set_device_class {
  if [ ! -z "$DEVICE_CLASS" ]; then
    if [ "x$DEVICE_CLASS" != "x$(get_device_class)" ]; then
      ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
        osd crush rm-device-class "osd.${OSD_ID}"
      ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
        osd crush set-device-class "${DEVICE_CLASS}" "osd.${OSD_ID}"
    fi
  fi
}

# Return the current CRUSH device class of this OSD
function get_device_class {
  echo $(ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
    osd crush get-device-class "osd.${OSD_ID}")
}