
This PS switches back to the old naming convention for VGs. The old naming convention has to be used until the changes that allow VG name updates to be handled are merged (https://review.opendev.org/#/c/745166/). Otherwise, OSDs will not come up after an upgrade. Change-Id: I1bf9ca93149a93dfd5f79813533ace3a1fe58002
350 lines
14 KiB
Smarty
350 lines
14 KiB
Smarty
#!/bin/bash
|
|
|
|
{{/*
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/}}
|
|
|
|
set -ex
|
|
|
|
source /tmp/osd-common-ceph-volume.sh
|
|
|
|
# Defaults that callers may override through the environment.
# OSD_FORCE_REPAIR=1 lets the prepare step zap devices it finds inconsistent.
: "${OSD_FORCE_REPAIR:=1}"
# We do not want to zap the journal disk, so this option is tracked separately.
: "${JOURNAL_FORCE_ZAP:=0}"

# STORAGE_TYPE is compared with everything after its last "-" stripped.
if [[ "${STORAGE_TYPE%-*}" == "bluestore" ]]; then
  export OSD_BLUESTORE=1
fi

# NOTE: the assignments below deliberately stay inside `export`. That masks the
# exit status of readlink, so an unset or unresolvable location leaves the
# variable empty instead of aborting the script under `set -e`.
if [[ "${STORAGE_TYPE%-*}" == "directory" ]]; then
  export OSD_DEVICE="/var/lib/ceph/osd"
else
  # Quoted so that a path containing whitespace or glob characters is
  # resolved as a single operand.
  export OSD_DEVICE=$(readlink -f "${STORAGE_LOCATION}")
fi

if [[ "${JOURNAL_TYPE}" == "directory" ]]; then
  export OSD_JOURNAL="/var/lib/ceph/journal"
else
  export OSD_JOURNAL=$(readlink -f "${JOURNAL_LOCATION}")
fi
|
|
|
|
# Make sure the volume group and logical volume backing a block.db / block.wal
# device exist, then publish the resulting "<vg>/<lv>" path.
#
# Arguments:
#   $1 - block device that hosts the DB/WAL volume group
#   $2 - size handed to `lvcreate -L`
#   $3 - "db" or "wal"
# Globals read:    osd_dev_string, osd_dev_split, OSD_ID, OSD_DEVICE
# Globals written: CEPH_LVM_PREPARE, BLOCK_DB (type "db"), BLOCK_WAL (type "wal")
function prep_device {
  local BLOCK_DEVICE=$1
  local BLOCK_DEVICE_SIZE=$2
  local device_type=$3
  local VG DEVICE_OSD_ID
  local dev_tag vg_label lv_label stale_mappings found_lv

  # "/dev/sdx" -> "dev-sdx": drop one leading slash, turn the rest into dashes.
  dev_tag=${BLOCK_DEVICE#/}
  dev_tag=${dev_tag//\//-}
  vg_label="ceph-db-wal-${dev_tag}"
  lv_label="ceph-${device_type}-${osd_dev_string}"

  VG=$(vgs --noheadings -o vg_name -S "vg_name=${vg_label}" | tr -d '[:space:]')

  if [[ -z "${VG}" ]]; then
    # No volume group yet: clear out any device-mapper entries reported for
    # this OSD on the block device, then create the group.
    stale_mappings=$(get_lvm_path_from_device "pv_name=~${BLOCK_DEVICE},lv_name=~dev-${osd_dev_split}")
    if [[ -n "${stale_mappings}" ]]; then
      # Unquoted on purpose: the helper's output is split into separate
      # arguments for dmsetup.
      dmsetup remove ${stale_mappings}
      disk_zap "${OSD_DEVICE}"
      CEPH_LVM_PREPARE=1
    fi
    VG=${vg_label}
    locked vgcreate "${VG}" "${BLOCK_DEVICE}"
  else
    # The volume group exists: preparation is only skipped when the volume
    # already carries the OSD id found on the data device.
    DEVICE_OSD_ID=$(get_osd_id_from_volume "/dev/${vg_label}/${lv_label}")
    CEPH_LVM_PREPARE=1
    if [[ -n "${OSD_ID}" ]]; then
      if [[ "${DEVICE_OSD_ID}" == "${OSD_ID}" ]]; then
        CEPH_LVM_PREPARE=0
      else
        disk_zap "${OSD_DEVICE}"
      fi
    fi
  fi

  found_lv=$(lvs --noheadings -o lv_name -S "lv_name=${lv_label}" | tr -d '[:space:]')
  if [[ "${found_lv}" != "${lv_label}" ]]; then
    locked lvcreate -L "${BLOCK_DEVICE_SIZE}" -n "${lv_label}" "${VG}"
  fi

  case "${device_type}" in
    db)
      BLOCK_DB="${VG}/${lv_label}"
      ;;
    wal)
      BLOCK_WAL="${VG}/${lv_label}"
      ;;
  esac
}
|
|
|
|
# Inspect OSD_DEVICE, decide whether it can be re-used, has to be zapped or
# needs a fresh LVM layout, and then run `ceph-volume simple scan` (device
# with an F800 partition) or `ceph-volume lvm prepare`.
#
# Globals read:    OSD_DEVICE, OSD_BOOTSTRAP_KEYRING, OSD_BLUESTORE,
#                  OSD_FORCE_REPAIR, JOURNAL_TYPE, OSD_JOURNAL,
#                  OSD_JOURNAL_UUID, BLOCK_DB, BLOCK_WAL, BLOCK_DB_SIZE,
#                  BLOCK_WAL_SIZE, DEVICE_CLASS, CLI_OPTS, DM_DEV
# Globals written: CEPH_DISK_USED, CEPH_LVM_PREPARE, DISK_ZAPPED,
#                  osd_dev_string, osd_dev_split, OSD_ID, OSD_FSID,
#                  CLUSTER_FSID, DM_NUM, DM_DEV, OSD_JOURNAL_DISK, OSD_JOURNAL,
#                  osdFSID, block_db_string, block_wal_string, CLI_OPTS,
#                  OSD_VG, OSD_LV
# Exits 1 when the device or keyring is missing, the cluster health check
# fails, or the device is not clean and may not be zapped.
# Returns early when OSD_FORCE_REPAIR cannot be honoured (unknown cluster
# FSID, or the expected data partition is not a block device).
function osd_disk_prepare {
  if [[ -z "${OSD_DEVICE}" ]]; then
    echo "ERROR- You must provide a device to build your OSD ie: /dev/sdb"
    exit 1
  fi

  if [[ ! -b "${OSD_DEVICE}" ]]; then
    echo "ERROR- The device pointed by OSD_DEVICE ($OSD_DEVICE) doesn't exist !"
    exit 1
  fi

  # Quoted: with an unset variable the unquoted form collapses to `[ ! -e ]`,
  # which is false, so the missing keyring would go unnoticed.
  if [ ! -e "${OSD_BOOTSTRAP_KEYRING}" ]; then
    echo "ERROR- $OSD_BOOTSTRAP_KEYRING must exist. You can extract it from your current monitor by running 'ceph auth get client.bootstrap-osd -o $OSD_BOOTSTRAP_KEYRING'"
    exit 1
  fi
  # CLI_OPTS is intentionally unquoted: it is a space-separated option list.
  timeout 10 ceph ${CLI_OPTS} --name client.bootstrap-osd --keyring "${OSD_BOOTSTRAP_KEYRING}" health || exit 1

  # Gather what is already known about the disk before deciding what to do.
  CEPH_DISK_USED=0
  CEPH_LVM_PREPARE=1
  # "/dev/sdb" -> "dev-sdb" (second and third "/"-separated fields).
  osd_dev_string=$(echo ${OSD_DEVICE} | awk -F "/" '{print $2}{print $3}' | paste -s -d'-')
  osd_dev_split=$(basename "${OSD_DEVICE}")
  udev_settle
  OSD_ID=$(get_osd_id_from_device "${OSD_DEVICE}")
  OSD_FSID=$(get_cluster_fsid_from_device "${OSD_DEVICE}")
  CLUSTER_FSID=$(ceph-conf --lookup fsid)
  DISK_ZAPPED=0

  if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
    # ---- filestore ----
    if [[ -n "${OSD_ID}" ]]; then
      # Map the device's child name from lsblk to a /dev/dm-<minor> node.
      DM_NUM=$(dmsetup ls | grep $(lsblk -J ${OSD_DEVICE} | jq -r '.blockdevices[].children[].name') | awk '{print $2}' | cut -d':' -f2 | cut -d')' -f1)
      DM_DEV="/dev/dm-"${DM_NUM}
    elif [[ $(sgdisk --print "${OSD_DEVICE}" | grep "F800") ]]; then
      # A partition with type code F800 exists: treat the disk as in use.
      DM_DEV=${OSD_DEVICE}$(sgdisk --print "${OSD_DEVICE}" | grep "F800" | awk '{print $1}')
      CEPH_DISK_USED=1
    else
      if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
        echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway"
        disk_zap "${OSD_DEVICE}"
        DISK_ZAPPED=1
      else
        echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
        echo "It would be too dangerous to destroy it without any notification."
        echo "Please set OSD_FORCE_REPAIR to '1' if you really want to zap this disk."
        exit 1
      fi
    fi
  else
    # ---- bluestore ----
    if [[ -n "${OSD_FSID}" ]]; then
      if [[ "${OSD_FSID}" == "${CLUSTER_FSID}" ]]; then
        if [[ -n "${OSD_ID}" ]]; then
          if ceph --name client.bootstrap-osd --keyring "${OSD_BOOTSTRAP_KEYRING}" osd ls | grep -w ${OSD_ID}; then
            echo "Running bluestore mode and ${OSD_DEVICE} already bootstrapped"
          elif [[ $OSD_FORCE_REPAIR -eq 1 ]]; then
            echo "OSD initialized for this cluster, but OSD ID not found in the cluster, reinitializing"
          else
            echo "OSD initialized for this cluster, but OSD ID not found in the cluster"
          fi
        fi
      else
        echo "OSD initialized for a different cluster, zapping it"
        disk_zap "${OSD_DEVICE}"
        udev_settle
      fi
    elif [[ $(sgdisk --print "${OSD_DEVICE}" | grep "F800") ]]; then
      DM_DEV=${OSD_DEVICE}$(sgdisk --print "${OSD_DEVICE}" | grep "F800" | awk '{print $1}')
      CEPH_DISK_USED=1
    else
      # A device-mapper entry for this disk (other than db/wal ones) marks it
      # as in use.
      if dmsetup ls | grep -i ${osd_dev_split} | grep -v "db--dev\|wal--dev"; then
        CEPH_DISK_USED=1
      fi
      # NOTE(review): when the disk is in use AND OSD_FORCE_REPAIR=1 this
      # still lands in the else branch, whose message blames OSD_FORCE_REPAIR
      # not being enabled. Kept as-is; confirm the intended behaviour.
      if [[ ${OSD_FORCE_REPAIR} -eq 1 ]] && [ "${CEPH_DISK_USED}" -ne 1 ]; then
        echo "${OSD_DEVICE} isn't clean, zapping it because OSD_FORCE_REPAIR is enabled"
        disk_zap "${OSD_DEVICE}"
      else
        echo "${OSD_DEVICE} isn't clean, but OSD_FORCE_REPAIR isn't enabled."
        echo "Please set OSD_FORCE_REPAIR to '1' if you want to zap this disk."
        exit 1
      fi
    fi
  fi

  if [ "${OSD_FORCE_REPAIR}" -eq 1 ] && [ -n "${DM_DEV}" ]; then
    if [ -b "${DM_DEV}" ]; then
      # Same value the function looked up above; no second ceph-conf call.
      local cephFSID=${CLUSTER_FSID}
      if [ -n "${cephFSID}" ]; then
        # Declared separately so a mktemp failure is not masked by `local`.
        local tmpmnt
        tmpmnt=$(mktemp -d)
        mount "${DM_DEV}" "${tmpmnt}"
        if [ "${OSD_BLUESTORE:-0}" -ne 1 ] && [ "${JOURNAL_TYPE}" != "directory" ]; then
          # we only care about journals for filestore.
          if [ -f "${tmpmnt}/whoami" ]; then
            OSD_JOURNAL_DISK=$(readlink -f "${tmpmnt}/journal")
            local osd_id=$(cat "${tmpmnt}/whoami")
            if [ ! -b "${OSD_JOURNAL_DISK}" ]; then
              OSD_JOURNAL=$(readlink -f "${OSD_JOURNAL}")
              # Journal path with every digit removed; equal to the path
              # itself when no partition number is part of it.
              local jdev=${OSD_JOURNAL//[0-9]/}
              if [ "${jdev}" == "${OSD_JOURNAL}" ]; then
                echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL}."
                echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
                rm -rf "${tmpmnt}/ceph_fsid"
              else
                echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL_DISK}."
                echo "Because OSD_FORCE_REPAIR is set and paritions are manually defined, we will"
                echo "attempt to recreate the missing journal device partitions."
                osd_journal_create "${OSD_JOURNAL}"
                ln -sf "/dev/disk/by-partuuid/${OSD_JOURNAL_UUID}" "${tmpmnt}/journal"
                echo "${OSD_JOURNAL_UUID}" | tee "${tmpmnt}/journal_uuid"
                # "ceph:" = owner ceph plus ceph's login group; the "." form
                # is obsolescent in GNU chown.
                chown ceph: "${OSD_JOURNAL}"
                # During OSD start we will format the journal and set the fsid
                touch "${tmpmnt}/run_mkjournal"
              fi
            fi
          else
            echo "It looks like ${OSD_DEVICE} has a ceph data partition but is missing it's metadata."
            echo "The device may contain inconsistent metadata or be corrupted."
            echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
            rm -rf "${tmpmnt}/ceph_fsid"
          fi
        fi
        if [ -f "${tmpmnt}/ceph_fsid" ]; then
          osdFSID=$(cat "${tmpmnt}/ceph_fsid")
          # Quoted: an empty ceph_fsid file used to make `[` fail with a
          # syntax error, which was then treated as "same cluster". An empty
          # FSID now counts as a mismatch, like the "no FSID" branch below.
          if [ "${osdFSID}" != "${cephFSID}" ]; then
            echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
            echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
            echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
            zap_extra_partitions "${tmpmnt}"
            umount "${tmpmnt}"
            disk_zap "${OSD_DEVICE}"
          else
            umount "${tmpmnt}"
            echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster."
            echo "OSD_FORCE_REPAIR is set, but will be ignored and the device will not be zapped."
            echo "Moving on, trying to activate the OSD now."
          fi
        else
          echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
          echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
          zap_extra_partitions "${tmpmnt}"
          umount "${tmpmnt}"
          disk_zap "${OSD_DEVICE}"
        fi
      else
        echo "Unable to determine the FSID of the current cluster."
        echo "OSD_FORCE_REPAIR is set, but this OSD will not be zapped."
        echo "Moving on, trying to activate the OSD now."
        return
      fi
    else
      echo "parted says ${DM_DEV} should exist, but we do not see it."
      echo "We will ignore OSD_FORCE_REPAIR and try to use the device as-is"
      echo "Moving on, trying to activate the OSD now."
      return
    fi
  else
    echo "INFO- It looks like ${OSD_DEVICE} is an OSD LVM"
    echo "Moving on, trying to prepare and activate the OSD LVM now."
  fi

  if [ "${OSD_BLUESTORE:-0}" -eq 1 ] && [ "${CEPH_DISK_USED}" -eq 0 ]; then
    # Derive the name strings from the raw device paths first: prep_device
    # replaces BLOCK_DB / BLOCK_WAL with "<vg>/<lv>".
    if [[ -n "${BLOCK_DB}" ]]; then
      block_db_string=$(echo ${BLOCK_DB} | awk -F "/" '{print $2}{print $3}' | paste -s -d'-')
    fi
    if [[ -n "${BLOCK_WAL}" ]]; then
      block_wal_string=$(echo ${BLOCK_WAL} | awk -F "/" '{print $2}{print $3}' | paste -s -d'-')
    fi
    # DB first, then WAL; each only when configured.
    if [[ -n "${BLOCK_DB}" ]]; then
      prep_device "${BLOCK_DB}" "${BLOCK_DB_SIZE}" "db"
    fi
    if [[ -n "${BLOCK_WAL}" ]]; then
      prep_device "${BLOCK_WAL}" "${BLOCK_WAL_SIZE}" "wal"
    fi
    if [[ -z "${BLOCK_DB}" && -z "${BLOCK_WAL}" ]]; then
      # Without separate DB/WAL devices, a PV that already belongs to a
      # "ceph" VG means there is nothing left to prepare.
      if pvdisplay "${OSD_DEVICE}" | grep "VG Name" | awk '{print $3}' | grep "ceph"; then
        CEPH_LVM_PREPARE=0
      fi
    fi
  else
    if pvdisplay "${OSD_DEVICE}" | grep "VG Name" | awk '{print $3}' | grep "ceph"; then
      CEPH_LVM_PREPARE=0
    fi
  fi

  if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then
    CLI_OPTS="${CLI_OPTS} --bluestore"

    if [ -n "$BLOCK_DB" ]; then
      CLI_OPTS="${CLI_OPTS} --block.db ${BLOCK_DB}"
    fi

    if [ -n "$BLOCK_WAL" ]; then
      CLI_OPTS="${CLI_OPTS} --block.wal ${BLOCK_WAL}"
    fi
  else
    # we only care about journals for filestore.
    osd_journal_prepare
    CLI_OPTS="${CLI_OPTS} --data ${OSD_DEVICE} --journal ${OSD_JOURNAL}"
    udev_settle
  fi

  if [ -n "$DEVICE_CLASS" ]; then
    CLI_OPTS="${CLI_OPTS} --crush-device-class ${DEVICE_CLASS}"
  fi

  if [[ ${CEPH_DISK_USED} -eq 1 ]]; then
    CLI_OPTS="${CLI_OPTS} --data ${OSD_DEVICE}"
    ceph-volume simple scan --force "${OSD_DEVICE}$(sgdisk --print "${OSD_DEVICE}" | grep "F800" | awk '{print $1}')"
  elif [[ ${CEPH_LVM_PREPARE} -eq 1 ]] || [[ ${DISK_ZAPPED} -eq 1 ]]; then
    local data_vg="ceph-vg-${osd_dev_string}"
    local data_lv="ceph-lv-${osd_dev_string}"
    udev_settle
    # -F -x: match the whole name literally. A plain substring match lets
    # "ceph-vg-dev-sda" also hit "ceph-vg-dev-sdaa".
    if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep -Fx -- "${data_vg}") ]]; then
      OSD_VG=${data_vg}
    else
      vgcreate "${data_vg}" "${OSD_DEVICE}"
      OSD_VG=${data_vg}
    fi
    if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep -Fx -- "${data_lv}") != "${data_lv}" ]]; then
      lvcreate --yes -l 100%FREE -n "${data_lv}" "${OSD_VG}"
    fi
    OSD_LV=${OSD_VG}/${data_lv}
    CLI_OPTS="${CLI_OPTS} --data ${OSD_LV}"
    locked ceph-volume lvm -v prepare ${CLI_OPTS}
    udev_settle
  fi
}
|
|
|
|
# Create a journal partition with sgdisk and point OSD_JOURNAL at it.
#
# Arguments:
#   $1 - journal partition path; its digits are taken as the partition number
#        and the remainder as the backing device
# Globals read:    OSD_JOURNAL_SIZE (used as "+<size>M"), OSD_JOURNAL_UUID
# Globals written: OSD_JOURNAL
# Exits 1 when the backing device is not a block device.
function osd_journal_create {
  local osd_journal=${1}
  # Partition number = every digit in the path; device = path without digits.
  local osd_journal_partition=${osd_journal//[^0-9]/}
  local jdev=${osd_journal//[0-9]/}

  if [ -b "${jdev}" ]; then
    # --change-name must be double-quoted: inside single quotes the partition
    # number was never expanded and sgdisk received the literal text
    # "${osd_journal_partition}".
    sgdisk --new="${osd_journal_partition}:0:+${OSD_JOURNAL_SIZE}M" \
      --change-name="${osd_journal_partition}:ceph journal" \
      --partition-guid="${osd_journal_partition}:${OSD_JOURNAL_UUID}" \
      --typecode="${osd_journal_partition}:45b0969e-9b03-4f30-b4c6-b4b80ceff106" \
      --mbrtogpt -- "${jdev}"
    OSD_JOURNAL=$(dev_part "${jdev}" "${osd_journal_partition}")
    udev_settle
  else
    echo "The backing device ${jdev} for ${OSD_JOURNAL} does not exist on this system."
    exit 1
  fi
}
|
|
|
|
# Resolve (and, if needed, create) the filestore journal device, hand it to
# the ceph user and append --filestore to CLI_OPTS.
#
# Globals read:    OSD_JOURNAL, JOURNAL_TYPE, OSD_DEVICE, CLI_OPTS
# Globals written: OSD_JOURNAL, OSD_JOURNAL_PARTITION, CLI_OPTS
function osd_journal_prepare {
  if [ -n "${OSD_JOURNAL}" ]; then
    if [ -b "${OSD_JOURNAL}" ]; then
      OSD_JOURNAL=$(readlink -f "${OSD_JOURNAL}")
      # Digits of the resolved path are taken as the partition number.
      OSD_JOURNAL_PARTITION=${OSD_JOURNAL//[^0-9]/}
      local jdev=${OSD_JOURNAL//[0-9]/}
      if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
        # NOTE(review): dev_part is only called when no partition number was
        # found, so its second argument is always empty here - confirm this
        # is the intended condition. The expansion is left unquoted so
        # dev_part keeps receiving a single argument.
        OSD_JOURNAL=$(dev_part "${jdev}" ${OSD_JOURNAL_PARTITION})
      fi
    elif [ "${JOURNAL_TYPE}" != "directory" ]; then
      # The journal path is not a block device yet; keep creating the
      # partition until it shows up.
      OSD_JOURNAL=$(readlink -f "${OSD_JOURNAL}")
      until [ -b "${OSD_JOURNAL}" ]; do
        osd_journal_create "${OSD_JOURNAL}"
      done
    fi
    # "ceph:" = owner ceph plus ceph's login group; the "." separator is
    # obsolescent in GNU chown.
    chown ceph: "${OSD_JOURNAL}"
  elif [ "${JOURNAL_TYPE}" != "directory" ]; then
    echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}"
    echo "For better performance on HDD, consider moving your journal to a separate device"
  fi
  CLI_OPTS="${CLI_OPTS} --filestore"
}
|
|
|
|
# Entry point: disks are prepared for every storage type except "directory"
# (STORAGE_TYPE is compared with everything after its last "-" stripped).
case "${STORAGE_TYPE%-*}" in
  directory)
    ;;
  *)
    osd_disk_prepare
    ;;
esac
|