From 41684a3c2937c4cf8afdc0997b66dc38662fa09d Mon Sep 17 00:00:00 2001
From: kranthikirang
Date: Wed, 18 Sep 2019 00:09:15 +0000
Subject: [PATCH] ceph-volume integration to ceph-osd charts

ceph-disk has been deprecated, and ceph-volume has been available since
the Luminous release. This change uplifts the ceph-osd chart to use
ceph-volume, with support for all of the combinations below:

Filestore:
  ceph-disk to ceph-volume
  ceph-volume to ceph-volume

Bluestore (including db/wal combinations):
  ceph-disk to ceph-volume
  ceph-volume to ceph-volume

Different OSDs may run different object stores, and upgrades preserve
db/wal combinations. Cross-upgrading an OSD from one object store to
the other is not supported.

Story: ceph-volume-support

Signed-off-by: Kranthi Guttikonda
Co-Authored-By: Chinasubbareddy Mallavarapu
Change-Id: Id8b2e1bda0d35fef2cffed6a5ca5876f3888a1c7
---
 .../bin/osd/ceph-volume/_block.sh.tpl         | 151 ++++++
 .../bin/osd/ceph-volume/_bluestore.sh.tpl     | 112 +++++
 .../bin/osd/ceph-volume/_common.sh.tpl        | 251 ++++++++++
 .../ceph-volume/_init-with-ceph-volume.sh.tpl | 438 ++++++++++++++++++
 ceph-osd/templates/configmap-bin.yaml         |  12 +-
 ceph-osd/templates/daemonset-osd.yaml         |  41 ++
 .../utils/_osd_daemonset_overrides.tpl        |  12 +-
 ceph-osd/values.yaml                          |  22 +-
 8 files changed, 1027 insertions(+), 12 deletions(-)
 create mode 100644 ceph-osd/templates/bin/osd/ceph-volume/_block.sh.tpl
 create mode 100644 ceph-osd/templates/bin/osd/ceph-volume/_bluestore.sh.tpl
 create mode 100644 ceph-osd/templates/bin/osd/ceph-volume/_common.sh.tpl
 create mode 100644 ceph-osd/templates/bin/osd/ceph-volume/_init-with-ceph-volume.sh.tpl

diff --git a/ceph-osd/templates/bin/osd/ceph-volume/_block.sh.tpl b/ceph-osd/templates/bin/osd/ceph-volume/_block.sh.tpl
new file mode 100644
index 000000000..bc657ec01
--- /dev/null
+++ b/ceph-osd/templates/bin/osd/ceph-volume/_block.sh.tpl
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+{{/*
+Copyright 2017 The Openstack-Helm Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+
+source /tmp/osd-common-ceph-volume.sh
+
+set -ex
+
+: "${OSD_SOFT_FORCE_ZAP:=1}"
+: "${OSD_JOURNAL_DISK:=}"
+
+if [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then
+  export OSD_DEVICE="/var/lib/ceph/osd"
+else
+  export OSD_DEVICE=$(readlink -f ${STORAGE_LOCATION})
+fi
+
+if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then
+  export OSD_JOURNAL="/var/lib/ceph/journal"
+else
+  export OSD_JOURNAL=$(readlink -f ${JOURNAL_LOCATION})
+fi
+
+if [[ -z "${OSD_DEVICE}" ]]; then
+  echo "ERROR- You must provide a device to build your OSD, e.g. /dev/sdb"
+  exit 1
+fi
+
+if [[ ! -b "${OSD_DEVICE}" ]]; then
+  echo "ERROR- The device pointed to by OSD_DEVICE (${OSD_DEVICE}) doesn't exist!"
+  exit 1
+fi
+
+CEPH_DISK_OPTIONS=""
+CEPH_OSD_OPTIONS=""
+
+udev_settle
+
+OSD_ID=$(ceph-volume inventory ${OSD_DEVICE} | grep "osd id" | awk '{print $3}')
+simple_activate=0
+if [[ -z ${OSD_ID} ]]; then
+  echo "Looks like ceph-disk has been used earlier to activate the OSD."
+  tmpmnt=$(mktemp -d)
+  mount ${OSD_DEVICE}1 ${tmpmnt}
+  OSD_ID=$(cat ${tmpmnt}/whoami)
+  umount ${tmpmnt}
+  simple_activate=1
+fi
+OSD_FSID=$(ceph-volume inventory ${OSD_DEVICE} | grep "osd fsid" | awk '{print $3}')
+if [[ -z ${OSD_FSID} ]]; then
+  echo "Looks like ceph-disk has been used earlier to activate the OSD."
+  tmpmnt=$(mktemp -d)
+  mount ${OSD_DEVICE}1 ${tmpmnt}
+  OSD_FSID=$(cat ${tmpmnt}/fsid)
+  umount ${tmpmnt}
+  simple_activate=1
+fi
+OSD_PATH="${OSD_PATH_BASE}-${OSD_ID}"
+OSD_KEYRING="${OSD_PATH}/keyring"
+
+mkdir -p ${OSD_PATH}
+
+if [[ ${simple_activate} -eq 1 ]]; then
+  ceph-volume simple activate --no-systemd ${OSD_ID} ${OSD_FSID}
+else
+  ceph-volume lvm -v \
+    --setuser ceph \
+    --setgroup disk \
+    activate ${CEPH_DISK_OPTIONS} \
+    --auto-detect-objectstore \
+    --no-systemd ${OSD_ID} ${OSD_FSID}
+fi
+
+# NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
+OSD_WEIGHT=0
+# NOTE(supamatt): add or move the OSD's CRUSH location
+crush_location
+
+if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
+  if [ -n "${OSD_JOURNAL}" ]; then
+    if [ -b "${OSD_JOURNAL}" ]; then
+      OSD_JOURNAL_DISK="$(readlink -f ${OSD_PATH}/journal)"
+      if [ -z "${OSD_JOURNAL_DISK}" ]; then
+        echo "ERROR: Unable to find the journal device pointed to by ${OSD_PATH}/journal"
+        exit 1
+      else
+        OSD_JOURNAL="${OSD_JOURNAL_DISK}"
+        if [ -e "${OSD_PATH}/run_mkjournal" ]; then
+          ceph-osd -i ${OSD_ID} --mkjournal
+          rm -rf ${OSD_PATH}/run_mkjournal
+        fi
+      fi
+    fi
+    if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then
+      OSD_JOURNAL="${OSD_JOURNAL}/journal.${OSD_ID}"
+      touch ${OSD_JOURNAL}
+      wait_for_file "${OSD_JOURNAL}"
+    else
+      if [ ! -b "${OSD_JOURNAL}" ]; then
+        echo "ERROR: Unable to find journal device ${OSD_JOURNAL}"
+        exit 1
+      else
+        chown ceph. "${OSD_JOURNAL}"
+      fi
+    fi
+  else
+    wait_for_file "${OSD_JOURNAL}"
+    chown ceph. "${OSD_JOURNAL}"
+  fi
+fi
+
+# NOTE(supamatt): Just in case permissions do not line up, we recursively set them correctly.
+if [ $(stat -c%U ${OSD_PATH}) != ceph ]; then
+  chown -R ceph. ${OSD_PATH};
+fi
+
+if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then
+  chown -R ceph. /var/lib/ceph/journal
+  ceph-osd \
+    --cluster ceph \
+    --osd-data ${OSD_PATH} \
+    --osd-journal ${OSD_JOURNAL} \
+    -f \
+    -i ${OSD_ID} \
+    --setuser ceph \
+    --setgroup disk \
+    --mkjournal
+fi
+
+exec /usr/bin/ceph-osd \
+    --cluster ${CLUSTER} \
+    ${CEPH_OSD_OPTIONS} \
+    -f \
+    -i ${OSD_ID} \
+    --setuser ceph \
+    --setgroup disk & echo $! > /run/ceph-osd.pid
+wait
diff --git a/ceph-osd/templates/bin/osd/ceph-volume/_bluestore.sh.tpl b/ceph-osd/templates/bin/osd/ceph-volume/_bluestore.sh.tpl
new file mode 100644
index 000000000..54686f8af
--- /dev/null
+++ b/ceph-osd/templates/bin/osd/ceph-volume/_bluestore.sh.tpl
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+{{/*
+Copyright 2017 The Openstack-Helm Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+
+source /tmp/osd-common-ceph-volume.sh
+
+set -ex
+
+: "${OSD_SOFT_FORCE_ZAP:=1}"
+
+export OSD_DEVICE=$(readlink -f ${STORAGE_LOCATION})
+
+if [[ -z "${OSD_DEVICE}" ]]; then
+  echo "ERROR- You must provide a device to build your OSD, e.g. /dev/sdb"
+  exit 1
+fi
+
+if [[ ! -b "${OSD_DEVICE}" ]]; then
+  echo "ERROR- The device pointed to by OSD_DEVICE (${OSD_DEVICE}) doesn't exist!"
+  exit 1
+fi
+
+CEPH_DISK_OPTIONS=""
+CEPH_OSD_OPTIONS=""
+
+udev_settle
+
+OSD_ID=$(ceph-volume inventory ${OSD_DEVICE} | grep "osd id" | awk '{print $3}')
+simple_activate=0
+if [[ -z ${OSD_ID} ]]; then
+  echo "Looks like ceph-disk has been used earlier to activate the OSD."
+  tmpmnt=$(mktemp -d)
+  mount ${OSD_DEVICE}1 ${tmpmnt}
+  OSD_ID=$(cat ${tmpmnt}/whoami)
+  umount ${tmpmnt}
+  simple_activate=1
+fi
+OSD_FSID=$(ceph-volume inventory ${OSD_DEVICE} | grep "osd fsid" | awk '{print $3}')
+if [[ -z ${OSD_FSID} ]]; then
+  echo "Looks like ceph-disk has been used earlier to activate the OSD."
+  tmpmnt=$(mktemp -d)
+  mount ${OSD_DEVICE}1 ${tmpmnt}
+  OSD_FSID=$(cat ${tmpmnt}/fsid)
+  umount ${tmpmnt}
+  simple_activate=1
+fi
+OSD_PATH="${OSD_PATH_BASE}-${OSD_ID}"
+OSD_KEYRING="${OSD_PATH}/keyring"
+
+mkdir -p ${OSD_PATH}
+
+if [[ ${simple_activate} -eq 1 ]]; then
+  ceph-volume simple activate --no-systemd ${OSD_ID} ${OSD_FSID}
+else
+  ceph-volume lvm -v \
+    --setuser ceph \
+    --setgroup disk \
+    activate ${CEPH_DISK_OPTIONS} \
+    --auto-detect-objectstore \
+    --no-systemd ${OSD_ID} ${OSD_FSID}
+  # Cross-check the db and wal symlinks in case they are missing
+  DB_DEV=$(ceph-volume lvm list ${OSD_DEVICE} | grep "db device" | awk '{print $3}')
+  if [[ ! -z ${DB_DEV} ]]; then
+    if [[ ! -h /var/lib/ceph/osd/ceph-${OSD_ID}/block.db ]]; then
+      ln -snf ${DB_DEV} /var/lib/ceph/osd/ceph-${OSD_ID}/block.db
+      chown -h ceph:ceph ${DB_DEV}
+      chown -h ceph:ceph /var/lib/ceph/osd/ceph-${OSD_ID}/block.db
+    fi
+  fi
+  WAL_DEV=$(ceph-volume lvm list ${OSD_DEVICE} | grep "wal device" | awk '{print $3}')
+  if [[ ! -z ${WAL_DEV} ]]; then
+    if [[ ! -h /var/lib/ceph/osd/ceph-${OSD_ID}/block.wal ]]; then
+      ln -snf ${WAL_DEV} /var/lib/ceph/osd/ceph-${OSD_ID}/block.wal
+      chown -h ceph:ceph ${WAL_DEV}
+      chown -h ceph:ceph /var/lib/ceph/osd/ceph-${OSD_ID}/block.wal
+    fi
+  fi
+fi
+
+# NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
+OSD_WEIGHT=0
+# NOTE(supamatt): add or move the OSD's CRUSH location
+crush_location
+
+# NOTE(supamatt): Just in case permissions do not line up, we recursively set them correctly.
+if [ $(stat -c%U ${OSD_PATH}) != ceph ]; then
+  chown -R ceph. ${OSD_PATH};
+fi
+
+exec /usr/bin/ceph-osd \
+    --cluster ${CLUSTER} \
+    ${CEPH_OSD_OPTIONS} \
+    -f \
+    -i ${OSD_ID} \
+    --setuser ceph \
+    --setgroup disk & echo $! > /run/ceph-osd.pid
+wait
diff --git a/ceph-osd/templates/bin/osd/ceph-volume/_common.sh.tpl b/ceph-osd/templates/bin/osd/ceph-volume/_common.sh.tpl
new file mode 100644
index 000000000..f27a3e91d
--- /dev/null
+++ b/ceph-osd/templates/bin/osd/ceph-volume/_common.sh.tpl
@@ -0,0 +1,251 @@
+#!/bin/bash
+
+{{/*
+Copyright 2017 The Openstack-Helm Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+
+set -ex
+
+: "${CRUSH_LOCATION:=root=default host=${HOSTNAME}}"
+: "${OSD_PATH_BASE:=/var/lib/ceph/osd/${CLUSTER}}"
+: "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}"
+: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}"
+: "${OSD_JOURNAL_UUID:=$(uuidgen)}"
+: "${OSD_JOURNAL_SIZE:=$(awk '/^osd_journal_size/{print $3}' ${CEPH_CONF}.template)}"
+: "${OSD_WEIGHT:=1.0}"
+
+eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
+eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))')
+eval CRUSH_FAILURE_DOMAIN_BY_HOSTNAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_by_hostname"]))')
+
+if [[ $(ceph -v | egrep -q "mimic|luminous"; echo $?) -ne 0 ]]; then
+  echo "ERROR- A Luminous or Mimic release of Ceph is required"
+  exit 1
+fi
+
+if [ -z "${HOSTNAME}" ]; then
+  echo "HOSTNAME not set; this will prevent the OSD from being added to the CRUSH map"
+  exit 1
+fi
+
+if [[ ! -e ${CEPH_CONF}.template ]]; then
+  echo "ERROR- ${CEPH_CONF}.template must exist; get it from your existing mon"
+  exit 1
+else
+  ENDPOINT=$(kubectl get endpoints ceph-mon -n ${NAMESPACE} -o json | awk -F'"' -v port=${MON_PORT} '/"ip"/{print $4":"port}' | paste -sd',')
+  if [[ ${ENDPOINT} == "" ]]; then
+    # No endpoints are available, just copy ceph.conf as-is
+    /bin/sh -c -e "cat ${CEPH_CONF}.template | tee ${CEPH_CONF}" || true
+  else
+    /bin/sh -c -e "cat ${CEPH_CONF}.template | sed 's/mon_host.*/mon_host = ${ENDPOINT}/g' | tee ${CEPH_CONF}" || true
+  fi
+fi
+
+# Wait for a file to exist, regardless of the type
+function wait_for_file {
+  timeout 10 bash -c "while [ ! -e ${1} ]; do echo 'Waiting for ${1} to show up' && sleep 1 ; done"
+}
+
+function is_available {
+  command -v $@ &>/dev/null
+}
+
+function ceph_cmd_retry() {
+  cnt=0
+  until "ceph" "$@" || [ $cnt -ge 6 ]; do
+    sleep 10
+    ((cnt++))
+  done
+}
+
+function crush_create_or_move {
+  local crush_location=${1}
+  ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${crush_location}
+}
+
+function crush_add_and_move {
+  local crush_failure_domain_type=${1}
+  local crush_failure_domain_name=${2}
+  local crush_location=$(echo "root=default ${crush_failure_domain_type}=${crush_failure_domain_name} host=${HOSTNAME}")
+  crush_create_or_move "${crush_location}"
+  local crush_failure_domain_location_check=$(ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | grep "${crush_failure_domain_type}" | awk -F '"' '{print $4}')
+  if [ "x${crush_failure_domain_location_check}" != "x${crush_failure_domain_name}" ]; then
+    # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
+    # as create-or-move may not appropriately move them.
+    ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+      osd crush add-bucket "${crush_failure_domain_name}" "${crush_failure_domain_type}" || true
+    ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+      osd crush move "${crush_failure_domain_name}" root=default || true
+    ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+      osd crush move "${HOSTNAME}" "${crush_failure_domain_type}=${crush_failure_domain_name}" || true
+  fi
+}
+
+function crush_location {
+  if [ "x${CRUSH_FAILURE_DOMAIN_TYPE}" != "xhost" ]; then
+    if [ "x${CRUSH_FAILURE_DOMAIN_NAME}" != "xfalse" ]; then
+      crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "${CRUSH_FAILURE_DOMAIN_NAME}"
+    elif [ "x${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}" != "xfalse" ]; then
+      crush_add_and_move "${CRUSH_FAILURE_DOMAIN_TYPE}" "$(echo ${CRUSH_FAILURE_DOMAIN_TYPE}_$(echo ${HOSTNAME} | cut -c ${CRUSH_FAILURE_DOMAIN_BY_HOSTNAME}))"
+    else
+      # NOTE(supamatt): neither variable is defined, so we fall back to the default behavior
+      crush_create_or_move "${CRUSH_LOCATION}"
+    fi
+  else
+    crush_create_or_move "${CRUSH_LOCATION}"
+  fi
+}
+
+# Calculate proper device names, given a device and partition number
+function dev_part {
+  local osd_device=${1}
+  local osd_partition=${2}
+
+  if [[ -L ${osd_device} ]]; then
+    # This device is a symlink. Work out its actual device
+    local actual_device=$(readlink -f "${osd_device}")
+    local bn=$(basename "${osd_device}")
+    if [[ "${actual_device:0-1:1}" == [0-9] ]]; then
+      local desired_partition="${actual_device}p${osd_partition}"
+    else
+      local desired_partition="${actual_device}${osd_partition}"
+    fi
+    # Now search for a symlink in the directory of $osd_device
+    # that has the correct desired partition, and the longest
+    # shared prefix with the original symlink
+    local symdir=$(dirname "${osd_device}")
+    local link=""
+    local pfxlen=0
+    for option in ${symdir}/*; do
+      [[ -e $option ]] || break
+      if [[ $(readlink -f "${option}") == "${desired_partition}" ]]; then
+        local optprefixlen=$(prefix_length "${option}" "${bn}")
+        # Numeric comparison; ${option} already carries the ${symdir} prefix
+        if [[ ${optprefixlen} -gt ${pfxlen} ]]; then
+          link=${option}
+          pfxlen=${optprefixlen}
+        fi
+      fi
+    done
+    if [[ $pfxlen -eq 0 ]]; then
+      >&2 echo "Could not locate appropriate symlink for partition ${osd_partition} of ${osd_device}"
+      exit 1
+    fi
+    echo "$link"
+  elif [[ "${osd_device:0-1:1}" == [0-9] ]]; then
+    echo "${osd_device}p${osd_partition}"
+  else
+    echo "${osd_device}${osd_partition}"
+  fi
+}
+
+function zap_extra_partitions {
+  # Examine temp mount and delete any block.db and block.wal partitions
+  mountpoint=${1}
+  journal_disk=""
+  journal_part=""
+  block_db_disk=""
+  block_db_part=""
+  block_wal_disk=""
+  block_wal_part=""
+
+  # Discover journal, block.db, and block.wal partitions first before deleting anything
+  # If the partitions are on the same disk, deleting one can affect discovery of the other(s)
+  if [ -L "${mountpoint}/journal" ]; then
+    journal_disk=$(readlink -m ${mountpoint}/journal | sed 's/[0-9]*//g')
+    journal_part=$(readlink -m ${mountpoint}/journal | sed 's/[^0-9]*//g')
+  fi
+  if [ -L "${mountpoint}/block.db" ]; then
+    block_db_disk=$(readlink -m ${mountpoint}/block.db | sed 's/[0-9]*//g')
+    block_db_part=$(readlink -m ${mountpoint}/block.db | sed 's/[^0-9]*//g')
+  fi
+  if [ -L "${mountpoint}/block.wal" ]; then
+    block_wal_disk=$(readlink -m ${mountpoint}/block.wal | sed 's/[0-9]*//g')
+    block_wal_part=$(readlink -m ${mountpoint}/block.wal | sed 's/[^0-9]*//g')
+  fi
+
+  # Delete any discovered journal, block.db, and block.wal partitions
+  if [ ! -z "${journal_disk}" ]; then
+    sgdisk -d ${journal_part} ${journal_disk}
+    /sbin/udevadm settle --timeout=600
+    /usr/bin/flock -s ${journal_disk} /sbin/partprobe ${journal_disk}
+    /sbin/udevadm settle --timeout=600
+  fi
+  if [ ! -z "${block_db_disk}" ]; then
+    sgdisk -d ${block_db_part} ${block_db_disk}
+    /sbin/udevadm settle --timeout=600
+    /usr/bin/flock -s ${block_db_disk} /sbin/partprobe ${block_db_disk}
+    /sbin/udevadm settle --timeout=600
+  fi
+  if [ ! -z "${block_wal_disk}" ]; then
+    sgdisk -d ${block_wal_part} ${block_wal_disk}
+    /sbin/udevadm settle --timeout=600
+    /usr/bin/flock -s ${block_wal_disk} /sbin/partprobe ${block_wal_disk}
+    /sbin/udevadm settle --timeout=600
+  fi
+}
+
+function disk_zap {
+  # Run all the commands that ceph-disk zap uses to clear a disk
+  local device=${1}
+  local osd_device_lvm=$(lsblk ${device} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}')
+  if [[ ! -z ${osd_device_lvm} ]]; then
+    dmsetup remove ${osd_device_lvm}
+  fi
+  if [[ $(pvdisplay ${OSD_DEVICE} | grep "VG Name" | awk '{print $3}' | grep "ceph") ]]; then
+    local LOCAL_VG=$(pvdisplay ${OSD_DEVICE} | grep "VG Name" | awk '{print $3}' | grep "ceph")
+    if [[ $(lvdisplay | grep ${LOCAL_VG} | grep "LV Path" | awk '{print $3}') ]]; then
+      echo "y" | lvremove $(lvdisplay | grep ${LOCAL_VG} | grep "LV Path" | awk '{print $3}')
+    fi
+    vgremove ${LOCAL_VG}
+    pvremove ${OSD_DEVICE}
+    ceph-volume lvm zap ${device} --destroy
+  fi
+  wipefs --all ${device}
+  # Wipe the first 200MB boundary, as Bluestore redeployments will not work otherwise
+  dd if=/dev/zero of=${device} bs=1M count=200
+  sgdisk --zap-all -- ${device}
+}
+
+function udev_settle {
+  partprobe "${OSD_DEVICE}"
+  if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then
+    if [ ! -z "$BLOCK_DB" ]; then
+      partprobe "${BLOCK_DB}"
+    fi
+    if [ ! -z "$BLOCK_WAL" ] && [ "$BLOCK_WAL" != "$BLOCK_DB" ]; then
+      partprobe "${BLOCK_WAL}"
+    fi
+  else
+    if [ "x$JOURNAL_TYPE" == "xblock-logical" ] && [ ! -z "$OSD_JOURNAL" ]; then
+      OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
+      if [ ! -z "$OSD_JOURNAL" ]; then
+        local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
+        partprobe "${JDEV}"
+      fi
+    fi
+  fi
+  # watch the udev event queue, and exit if all current events are handled
+  udevadm settle --timeout=600
+
+  # On occasion udev may not create the correct device symlinks for Ceph; just in case, we create them manually
+  mkdir -p /dev/disk/by-partuuid
+  for dev in $(awk '!/rbd/{print $4}' /proc/partitions | grep "[0-9]"); do
+    diskdev=$(echo "${dev//[!a-z]/}")
+    partnum=$(echo "${dev//[!0-9]/}")
+    ln -s "../../${dev}" "/dev/disk/by-partuuid/$(sgdisk -i ${partnum} /dev/${diskdev} | awk '/Partition unique GUID/{print tolower($4)}')" || true
+  done
+}
diff --git a/ceph-osd/templates/bin/osd/ceph-volume/_init-with-ceph-volume.sh.tpl b/ceph-osd/templates/bin/osd/ceph-volume/_init-with-ceph-volume.sh.tpl
new file mode 100644
index 000000000..e1c516010
--- /dev/null
+++ b/ceph-osd/templates/bin/osd/ceph-volume/_init-with-ceph-volume.sh.tpl
@@ -0,0 +1,438 @@
+#!/bin/bash
+
+{{/*
+Copyright 2017 The Openstack-Helm Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+
+set -ex
+
+source /tmp/osd-common-ceph-volume.sh
+
+: "${OSD_FORCE_REPAIR:=1}"
+# We do not want to zap the journal disk. Tracking this option separately.
+: "${JOURNAL_FORCE_ZAP:=0}"
+
+if [ "x${STORAGE_TYPE%-*}" == "xbluestore" ]; then
+  export OSD_BLUESTORE=1
+fi
+
+if [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then
+  export OSD_DEVICE="/var/lib/ceph/osd"
+else
+  export OSD_DEVICE=$(readlink -f ${STORAGE_LOCATION})
+fi
+
+if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then
+  export OSD_JOURNAL="/var/lib/ceph/journal"
+else
+  export OSD_JOURNAL=$(readlink -f ${JOURNAL_LOCATION})
+fi
+
+function osd_disk_prepare {
+  if [[ -z "${OSD_DEVICE}" ]]; then
+    echo "ERROR- You must provide a device to build your OSD, e.g. /dev/sdb"
+    exit 1
+  fi
+
+  if [[ ! -b "${OSD_DEVICE}" ]]; then
+    echo "ERROR- The device pointed to by OSD_DEVICE (${OSD_DEVICE}) doesn't exist!"
+    exit 1
+  fi
+
+  if [ ! -e $OSD_BOOTSTRAP_KEYRING ]; then
+    echo "ERROR- $OSD_BOOTSTRAP_KEYRING must exist. You can extract it from your current monitor by running 'ceph auth get client.bootstrap-osd -o $OSD_BOOTSTRAP_KEYRING'"
+    exit 1
+  fi
+  timeout 10 ceph ${CLI_OPTS} --name client.bootstrap-osd --keyring $OSD_BOOTSTRAP_KEYRING health || exit 1
+
+  # Search for ceph metadata on the disk, based on the state of the disk/LVM for filestore
+  CEPH_DISK_USED=0
+  CEPH_LVM_PREPARE=1
+  osd_dev_string=$(echo ${OSD_DEVICE} | awk -F "/" '{print $2}{print $3}' | paste -s -d'-')
+  OSD_ID=$(ceph-volume inventory ${OSD_DEVICE} | grep "osd id" | awk '{print $3}')
+  if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
+    if [[ ! -z ${OSD_ID} ]]; then
+      DM_NUM=$(dmsetup ls | grep $(lsblk -J ${OSD_DEVICE} | jq -r '.blockdevices[].children[].name') | awk '{print $2}' | cut -d':' -f2 | cut -d')' -f1)
+      DM_DEV="/dev/dm-"${DM_NUM}
+    elif [[ $(sgdisk --print ${OSD_DEVICE} | grep "F800") ]]; then
+      DM_DEV=${OSD_DEVICE}$(sgdisk --print ${OSD_DEVICE} | grep "F800" | awk '{print $1}')
+      CEPH_DISK_USED=1
+    else
+      if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
+        echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway"
+        disk_zap ${OSD_DEVICE}
+      else
+        echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
+        echo "It would be too dangerous to destroy it without any notification."
+        echo "Please set OSD_FORCE_REPAIR to '1' if you really want to zap this disk."
+        exit 1
+      fi
+    fi
+  else
+    if [[ ! -z ${OSD_ID} ]]; then
+      echo "Running bluestore mode and ${OSD_DEVICE} already bootstrapped"
+    elif [[ $(sgdisk --print ${OSD_DEVICE} | grep "F800") ]]; then
+      DM_DEV=${OSD_DEVICE}$(sgdisk --print ${OSD_DEVICE} | grep "F800" | awk '{print $1}')
+      CEPH_DISK_USED=1
+    else
+      if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
+        echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway"
+        disk_zap ${OSD_DEVICE}
+      else
+        echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
+        echo "It would be too dangerous to destroy it without any notification."
+        echo "Please set OSD_FORCE_REPAIR to '1' if you really want to zap this disk."
+        exit 1
+      fi
+    fi
+  fi
+  if [ ${OSD_FORCE_REPAIR} -eq 1 ] && [ ! -z ${DM_DEV} ]; then
+    if [ -b $DM_DEV ]; then
+      local cephFSID=$(ceph-conf --lookup fsid)
+      if [ ! -z "${cephFSID}" ]; then
+        local tmpmnt=$(mktemp -d)
+        mount ${DM_DEV} ${tmpmnt}
+        if [ "${OSD_BLUESTORE:-0}" -ne 1 ] && [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
+          # we only care about journals for filestore.
+          if [ -f "${tmpmnt}/whoami" ]; then
+            OSD_JOURNAL_DISK=$(readlink -f "${tmpmnt}/journal")
+            local osd_id=$(cat "${tmpmnt}/whoami")
+            if [ ! -b "${OSD_JOURNAL_DISK}" ]; then
+              OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
+              local jdev=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
+              if [ ${jdev} == ${OSD_JOURNAL} ]; then
+                echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL}."
+                echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
+                rm -rf ${tmpmnt}/ceph_fsid
+              else
+                echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL_DISK}."
+                echo "Because OSD_FORCE_REPAIR is set and partitions are manually defined, we will"
+                echo "attempt to recreate the missing journal device partitions."
+                osd_journal_create ${OSD_JOURNAL}
+                ln -sf /dev/disk/by-partuuid/${OSD_JOURNAL_UUID} ${tmpmnt}/journal
+                echo ${OSD_JOURNAL_UUID} | tee ${tmpmnt}/journal_uuid
+                chown ceph. ${OSD_JOURNAL}
+                # During OSD start we will format the journal and set the fsid
+                touch ${tmpmnt}/run_mkjournal
+              fi
+            fi
+          else
+            echo "It looks like ${OSD_DEVICE} has a ceph data partition but is missing its metadata."
+            echo "The device may contain inconsistent metadata or be corrupted."
+            echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
+            rm -rf ${tmpmnt}/ceph_fsid
+          fi
+        fi
+        if [ -f "${tmpmnt}/ceph_fsid" ]; then
+          osdFSID=$(cat "${tmpmnt}/ceph_fsid")
+          if [ ${osdFSID} != ${cephFSID} ]; then
+            echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
+            echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
+            echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
+            zap_extra_partitions ${tmpmnt}
+            umount ${tmpmnt}
+            disk_zap ${OSD_DEVICE}
+          else
+            umount ${tmpmnt}
+            echo "It looks like ${OSD_DEVICE} is an OSD belonging to this ceph cluster."
+            echo "OSD_FORCE_REPAIR is set, but will be ignored and the device will not be zapped."
+            echo "Moving on, trying to activate the OSD now."
+          fi
+        else
+          echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
+          echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
+          zap_extra_partitions ${tmpmnt}
+          umount ${tmpmnt}
+          disk_zap ${OSD_DEVICE}
+        fi
+      else
+        echo "Unable to determine the FSID of the current cluster."
+        echo "OSD_FORCE_REPAIR is set, but this OSD will not be zapped."
+        echo "Moving on, trying to activate the OSD now."
+        return
+      fi
+    else
+      echo "parted says ${DM_DEV} should exist, but we do not see it."
+      echo "We will ignore OSD_FORCE_REPAIR and try to use the device as-is."
+      echo "Moving on, trying to activate the OSD now."
+      return
+    fi
+  else
+    echo "INFO- It looks like ${OSD_DEVICE} is an OSD LVM"
+    echo "Moving on, trying to prepare and activate the OSD LVM now."
+  fi
+
+  if [ "${OSD_BLUESTORE:-0}" -eq 1 ] && [ ${CEPH_DISK_USED} -eq 0 ]; then
+    if [[ ${BLOCK_DB} ]]; then
+      block_db_string=$(echo ${BLOCK_DB} | awk -F "/" '{print $2}{print $3}' | paste -s -d'-')
+    fi
+    if [[ ${BLOCK_WAL} ]]; then
+      block_wal_string=$(echo ${BLOCK_WAL} | awk -F "/" '{print $2}{print $3}' | paste -s -d'-')
+    fi
+    exec {lock_fd}>/var/lib/ceph/tmp/init-osd.lock || exit 1
+    flock -w 60 -E 0 --verbose "${lock_fd}"
+    if [[ ${BLOCK_DB} && ${BLOCK_WAL} ]]; then
+      if [[ ${block_db_string} == ${block_wal_string} ]]; then
+        if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
+          VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
+          WAL_OSD_ID=$(ceph-volume lvm list /dev/ceph-db-wal-${block_db_string}/ceph-wal-${osd_dev_string} | grep "osd id" | awk '{print $3}')
+          DB_OSD_ID=$(ceph-volume lvm list /dev/ceph-db-wal-${block_db_string}/ceph-db-${osd_dev_string} | grep "osd id" | awk '{print $3}')
+          if [ ! -z ${OSD_ID} ] && ([ ${WAL_OSD_ID} != ${OSD_ID} ] || [ ${DB_OSD_ID} != ${OSD_ID} ]); then
+            echo "Found the VG, but the corresponding DB or WAL does not belong to this OSD; zapping ${OSD_DEVICE}"
+            disk_zap ${OSD_DEVICE}
+            CEPH_LVM_PREPARE=1
+          elif [ ! -z ${OSD_ID} ] && ([ -z ${WAL_OSD_ID} ] || [ -z ${DB_OSD_ID} ]); then
+            disk_zap ${OSD_DEVICE}
+            CEPH_LVM_PREPARE=1
+          elif [ -z ${OSD_ID} ]; then
+            CEPH_LVM_PREPARE=1
+          else
+            CEPH_LVM_PREPARE=0
+          fi
+        else
+          osd_dev_split=$(echo ${OSD_DEVICE} | awk -F "/" '{print $3}')
+          if [[ ! -z $(lsblk ${BLOCK_DB} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}' | grep ${osd_dev_split}) ]]; then
+            echo "dmsetup reference found but disks mismatch; removing all dmsetup references for ${BLOCK_DB}"
+            for item in $(lsblk ${BLOCK_DB} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}');
+            do
+              dmsetup remove ${item}
+            done
+            disk_zap ${OSD_DEVICE}
+            CEPH_LVM_PREPARE=1
+          fi
+          vgcreate ceph-db-wal-${block_db_string} ${BLOCK_DB}
+          VG=ceph-db-wal-${block_db_string}
+        fi
+        if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${osd_dev_string}") != "ceph-db-${osd_dev_string}" ]]; then
+          lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${osd_dev_string} ${VG}
+        fi
+        BLOCK_DB=${VG}/ceph-db-${osd_dev_string}
+        if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${osd_dev_string}") != "ceph-wal-${osd_dev_string}" ]]; then
+          lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${osd_dev_string} ${VG}
+        fi
+        BLOCK_WAL=${VG}/ceph-wal-${osd_dev_string}
+      else
+        if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
+          VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
+          DB_OSD_ID=$(ceph-volume lvm list /dev/ceph-db-wal-${block_db_string}/ceph-db-${block_db_string} | grep "osd id" | awk '{print $3}')
+          if [ ! -z ${OSD_ID} ] && [ ${DB_OSD_ID} != ${OSD_ID} ]; then
+            echo "Found the VG, but the corresponding DB does not belong to this OSD; zapping ${OSD_DEVICE}"
+            disk_zap ${OSD_DEVICE}
+            CEPH_LVM_PREPARE=1
+          elif [ ! -z ${OSD_ID} ] && [ -z ${DB_OSD_ID} ]; then
+            disk_zap ${OSD_DEVICE}
+            CEPH_LVM_PREPARE=1
+          elif [ -z ${OSD_ID} ]; then
+            CEPH_LVM_PREPARE=1
+          else
+            CEPH_LVM_PREPARE=0
+          fi
+        else
+          osd_dev_split=$(echo ${OSD_DEVICE} | awk -F "/" '{print $3}')
+          if [[ ! -z $(lsblk ${BLOCK_DB} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}' | grep ${osd_dev_split}) ]]; then
+            echo "dmsetup reference found but disks mismatch"
+            dmsetup remove $(lsblk ${BLOCK_DB} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}' | grep ${osd_dev_split})
+            disk_zap ${OSD_DEVICE}
+            CEPH_LVM_PREPARE=1
+          fi
+          vgcreate ceph-db-wal-${block_db_string} ${BLOCK_DB}
+          VG=ceph-db-wal-${block_db_string}
+        fi
+        if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") ]]; then
+          VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}")
+          WAL_OSD_ID=$(ceph-volume lvm list /dev/ceph-db-wal-${block_wal_string}/ceph-wal-${block_wal_string} | grep "osd id" | awk '{print $3}')
+          if [ ! -z ${OSD_ID} ] && [ ${WAL_OSD_ID} != ${OSD_ID} ]; then
+            echo "Found the VG, but the corresponding WAL does not belong to this OSD; zapping ${OSD_DEVICE}"
+            disk_zap ${OSD_DEVICE}
+            CEPH_LVM_PREPARE=1
+          elif [ ! -z ${OSD_ID} ] && [ -z ${WAL_OSD_ID} ]; then
+            disk_zap ${OSD_DEVICE}
+            CEPH_LVM_PREPARE=1
+          elif [ -z ${OSD_ID} ]; then
+            CEPH_LVM_PREPARE=1
+          else
+            CEPH_LVM_PREPARE=0
+          fi
+        else
+          osd_dev_split=$(echo ${OSD_DEVICE} | awk -F "/" '{print $3}')
+          if [[ ! -z $(lsblk ${BLOCK_WAL} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}' | grep ${osd_dev_split}) ]]; then
+            echo "dmsetup reference found but disks mismatch"
+            dmsetup remove $(lsblk ${BLOCK_WAL} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}' | grep ${osd_dev_split})
+            disk_zap ${OSD_DEVICE}
+            CEPH_LVM_PREPARE=1
+          fi
+          vgcreate ceph-db-wal-${block_wal_string} ${BLOCK_WAL}
+          VG=ceph-db-wal-${block_wal_string}
+        fi
+        if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${block_db_string}") != "ceph-db-${block_db_string}" ]]; then
+          lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${block_db_string} ${VG}
+        fi
+        BLOCK_DB=${VG}/ceph-db-${block_db_string}
+        if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${block_wal_string}") != "ceph-wal-${block_wal_string}" ]]; then
+          lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${block_wal_string} ${VG}
+        fi
+        BLOCK_WAL=${VG}/ceph-wal-${block_wal_string}
+      fi
+    elif [[ -z ${BLOCK_DB} && ${BLOCK_WAL} ]]; then
+      if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") ]]; then
+        VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}")
+        WAL_OSD_ID=$(ceph-volume lvm list /dev/ceph-wal-${block_wal_string}/ceph-wal-${osd_dev_string} | grep "osd id" | awk '{print $3}')
+        if [ ! -z ${OSD_ID} ] && [ ${WAL_OSD_ID} != ${OSD_ID} ]; then
+          echo "Found the VG, but the corresponding WAL does not belong to this OSD; zapping ${OSD_DEVICE}"
+          disk_zap ${OSD_DEVICE}
+          CEPH_LVM_PREPARE=1
+        elif [ ! -z ${OSD_ID} ] && [ -z ${WAL_OSD_ID} ]; then
+          disk_zap ${OSD_DEVICE}
+          CEPH_LVM_PREPARE=1
+        elif [ -z ${OSD_ID} ]; then
+          CEPH_LVM_PREPARE=1
+        else
+          CEPH_LVM_PREPARE=0
+        fi
+      else
+        osd_dev_split=$(echo ${OSD_DEVICE} | awk -F "/" '{print $3}')
+        if [[ ! -z $(lsblk ${BLOCK_WAL} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}' | grep ${osd_dev_split}) ]]; then
+          echo "dmsetup reference found but disks mismatch"
+          dmsetup remove $(lsblk ${BLOCK_WAL} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}' | grep ${osd_dev_split})
+          disk_zap ${OSD_DEVICE}
+          CEPH_LVM_PREPARE=1
+        fi
+        vgcreate ceph-wal-${block_wal_string} ${BLOCK_WAL}
+        VG=ceph-wal-${block_wal_string}
+      fi
+      if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${osd_dev_string}") != "ceph-wal-${osd_dev_string}" ]]; then
+        lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${osd_dev_string} ${VG}
+      fi
+      BLOCK_WAL=${VG}/ceph-wal-${osd_dev_string}
+    elif [[ ${BLOCK_DB} && -z ${BLOCK_WAL} ]]; then
+      if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
+        VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
+        DB_OSD_ID=$(ceph-volume lvm list /dev/ceph-db-${block_db_string}/ceph-db-${osd_dev_string} | grep "osd id" | awk '{print $3}')
+        if [ ! -z ${OSD_ID} ] && [ ${DB_OSD_ID} != ${OSD_ID} ]; then
+          echo "Found the VG, but the corresponding DB does not belong to this OSD; zapping ${OSD_DEVICE}"
+          disk_zap ${OSD_DEVICE}
+          CEPH_LVM_PREPARE=1
+        elif [ ! -z ${OSD_ID} ] && [ -z ${DB_OSD_ID} ]; then
+          disk_zap ${OSD_DEVICE}
+          CEPH_LVM_PREPARE=1
+        elif [ -z ${OSD_ID} ]; then
+          CEPH_LVM_PREPARE=1
+        else
+          CEPH_LVM_PREPARE=0
+        fi
+      else
+        osd_dev_split=$(echo ${OSD_DEVICE} | awk -F "/" '{print $3}')
+        if [[ ! -z $(lsblk ${BLOCK_DB} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}' | grep ${osd_dev_split}) ]]; then
+          echo "dmsetup reference found but disks mismatch"
+          dmsetup remove $(lsblk ${BLOCK_DB} -o name,type -l | grep "lvm" | grep "ceph" | awk '{print $1}' | grep ${osd_dev_split})
+          disk_zap ${OSD_DEVICE}
+          CEPH_LVM_PREPARE=1
+        fi
+        vgcreate ceph-db-${block_db_string} ${BLOCK_DB}
+        VG=ceph-db-${block_db_string}
+      fi
+      if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${osd_dev_string}") != "ceph-db-${osd_dev_string}" ]]; then
+        lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${osd_dev_string} ${VG}
+      fi
+      BLOCK_DB=${VG}/ceph-db-${osd_dev_string}
+    fi
+    # Release the lock regardless of which branch was taken above
+    flock -u "${lock_fd}"
+  else
+    if pvdisplay ${OSD_DEVICE} | grep "VG Name" | awk '{print $3}' | grep "ceph"; then
+      CEPH_LVM_PREPARE=0
+    fi
+  fi
+
+  if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then
+    CLI_OPTS="${CLI_OPTS} --bluestore"
+
+    if [ ! -z "$BLOCK_DB" ]; then
+      CLI_OPTS="${CLI_OPTS} --block.db ${BLOCK_DB}"
+    fi
+
+    if [ ! -z "$BLOCK_WAL" ]; then
+      CLI_OPTS="${CLI_OPTS} --block.wal ${BLOCK_WAL}"
+    fi
+  else
+    # we only care about journals for filestore.
+    osd_journal_prepare
+    CLI_OPTS="${CLI_OPTS} --data ${OSD_DEVICE} --journal ${OSD_JOURNAL}"
+    udev_settle
+  fi
+  if [[ ${CEPH_DISK_USED} -eq 1 ]]; then
+    CLI_OPTS="${CLI_OPTS} --data ${OSD_DEVICE}"
+    ceph-volume simple scan --force ${OSD_DEVICE}$(sgdisk --print ${OSD_DEVICE} | grep "F800" | awk '{print $1}')
+  elif [[ ${CEPH_LVM_PREPARE} == 1 ]]; then
+    if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "ceph-vg-${osd_dev_string}") ]]; then
+      OSD_VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "ceph-vg-${osd_dev_string}")
+    else
+      vgcreate ceph-vg-${osd_dev_string} ${OSD_DEVICE}
+      OSD_VG=ceph-vg-${osd_dev_string}
+    fi
+    if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-lv-${osd_dev_string}") != "ceph-lv-${osd_dev_string}" ]]; then
+      lvcreate --yes -l 100%FREE -n ceph-lv-${osd_dev_string} ${OSD_VG}
+    fi
+    OSD_LV=${OSD_VG}/ceph-lv-${osd_dev_string}
+    CLI_OPTS="${CLI_OPTS} --data ${OSD_LV}"
+    ceph-volume lvm -v prepare ${CLI_OPTS}
+  fi
+}
+
+function osd_journal_create {
+  local osd_journal=${1}
+  local osd_journal_partition=$(echo ${osd_journal} | sed 's/[^0-9]//g')
+  local jdev=$(echo ${osd_journal} | sed 's/[0-9]//g')
+  if [ -b "${jdev}" ]; then
+    sgdisk --new=${osd_journal_partition}:0:+${OSD_JOURNAL_SIZE}M \
+      --change-name="${osd_journal_partition}:ceph journal" \
+      --partition-guid=${osd_journal_partition}:${OSD_JOURNAL_UUID} \
+      --typecode=${osd_journal_partition}:45b0969e-9b03-4f30-b4c6-b4b80ceff106 --mbrtogpt -- ${jdev}
+    OSD_JOURNAL=$(dev_part ${jdev} ${osd_journal_partition})
+    udev_settle
+  else
+    echo "The backing device ${jdev} for ${OSD_JOURNAL} does not exist on this system."
+    exit 1
+  fi
+}
+
+function osd_journal_prepare {
+  if [ -n "${OSD_JOURNAL}" ]; then
+    if [ -b ${OSD_JOURNAL} ]; then
+      OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
+      OSD_JOURNAL_PARTITION=$(echo ${OSD_JOURNAL} | sed 's/[^0-9]//g')
+      local jdev=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
+      if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
+        OSD_JOURNAL=$(dev_part ${jdev} ${OSD_JOURNAL_PARTITION})
+      else
+        OSD_JOURNAL=${OSD_JOURNAL}
+      fi
+    elif [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
+      # The block device exists but doesn't appear to be partitioned; we will proceed with partitioning the device.
+      OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
+      until [ -b ${OSD_JOURNAL} ]; do
+        osd_journal_create ${OSD_JOURNAL}
+      done
+    fi
+    chown ceph. ${OSD_JOURNAL}
+  elif [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
+    echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}"
+    echo "For better performance on HDD, consider moving your journal to a separate device"
+  fi
+  CLI_OPTS="${CLI_OPTS} --filestore"
+}
+
+if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then
+  osd_disk_prepare
+fi
diff --git a/ceph-osd/templates/configmap-bin.yaml b/ceph-osd/templates/configmap-bin.yaml
index b32bc9796..32eedcdcf 100644
--- a/ceph-osd/templates/configmap-bin.yaml
+++ b/ceph-osd/templates/configmap-bin.yaml
@@ -40,14 +40,22 @@ data:
 {{ tuple "bin/osd/ceph-disk/_bluestore.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   osd-init-ceph-disk.sh: |
 {{ tuple "bin/osd/ceph-disk/_init-with-ceph-disk.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
+  osd-common-ceph-disk.sh: |
+{{ tuple "bin/osd/ceph-disk/_common.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
+  osd-block-ceph-volume.sh: |
+{{ tuple "bin/osd/ceph-volume/_block.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
| include "helm-toolkit.utils.template" | indent 4 }} + osd-bluestore-ceph-volume.sh: | +{{ tuple "bin/osd/ceph-volume/_bluestore.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} + osd-init-ceph-volume.sh: | +{{ tuple "bin/osd/ceph-volume/_init-with-ceph-volume.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} + osd-common-ceph-volume.sh: | +{{ tuple "bin/osd/ceph-volume/_common.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} osd-init.sh: | {{ tuple "bin/osd/_init.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} osd-check.sh: | {{ tuple "bin/osd/_check.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} osd-stop.sh: | {{ tuple "bin/osd/_stop.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} - osd-common-ceph-disk.sh: | -{{ tuple "bin/osd/ceph-disk/_common.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} init-dirs.sh: | {{ tuple "bin/_init-dirs.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} helm-tests.sh: | diff --git a/ceph-osd/templates/daemonset-osd.yaml b/ceph-osd/templates/daemonset-osd.yaml index 77ad9b26d..8ec6c3149 100644 --- a/ceph-osd/templates/daemonset-osd.yaml +++ b/ceph-osd/templates/daemonset-osd.yaml @@ -50,6 +50,7 @@ spec: {{ .Values.labels.osd.node_selector_key }}: {{ .Values.labels.osd.node_selector_value }} hostNetwork: true hostPID: true + hostIPC: true dnsPolicy: {{ .Values.pod.dns_policy }} initContainers: {{ tuple $envAll "osd" list | include "helm-toolkit.snippets.kubernetes_entrypoint_init_container" | indent 8 }} @@ -179,10 +180,18 @@ spec: mountPath: /tmp/init-ceph-disk.sh subPath: osd-init-ceph-disk.sh readOnly: true + - name: ceph-osd-bin + mountPath: /tmp/init-ceph-volume.sh + subPath: osd-init-ceph-volume.sh + readOnly: true - name: ceph-osd-bin mountPath: /tmp/osd-common-ceph-disk.sh subPath: osd-common-ceph-disk.sh readOnly: true + - name: ceph-osd-bin + mountPath: /tmp/osd-common-ceph-volume.sh + subPath: osd-common-ceph-volume.sh + readOnly: true - name: ceph-osd-etc mountPath: /etc/ceph/ceph.conf.template subPath: ceph.conf @@ -207,12 +216,21 @@ spec: - name: run-lvm mountPath: /run/lvm readOnly: false + - name: run-udev + mountPath: /run/udev + readOnly: false + - name: pod-etc-lvm + mountPath: /etc/lvm + readOnly: false - name: data mountPath: /var/lib/ceph/osd readOnly: false - name: journal mountPath: /var/lib/ceph/journal readOnly: false + - name: pod-var-log + mountPath: /var/log/ceph + readOnly: false containers: - name: ceph-osd-default {{ tuple $envAll "ceph_osd" | include "helm-toolkit.snippets.image" | indent 10 }} @@ -284,10 +302,18 @@ spec: mountPath: /tmp/osd-block-ceph-disk.sh subPath: osd-block-ceph-disk.sh readOnly: true + - name: ceph-osd-bin + mountPath: /tmp/osd-block-ceph-volume.sh + subPath: osd-block-ceph-volume.sh + readOnly: true - name: ceph-osd-bin mountPath: /tmp/osd-bluestore-ceph-disk.sh subPath: osd-bluestore-ceph-disk.sh readOnly: true + - name: ceph-osd-bin + mountPath: /tmp/osd-bluestore-ceph-volume.sh + subPath: osd-bluestore-ceph-volume.sh + readOnly: true - name: ceph-osd-bin mountPath: /tmp/osd-check.sh subPath: osd-check.sh @@ -304,6 +330,10 @@ spec: mountPath: /tmp/osd-common-ceph-disk.sh subPath: osd-common-ceph-disk.sh readOnly: true + - name: ceph-osd-bin + mountPath: /tmp/osd-common-ceph-volume.sh + subPath: osd-common-ceph-volume.sh + readOnly: true - name: ceph-osd-bin mountPath: /tmp/utils-defragOSDs.sh subPath: utils-defragOSDs.sh @@ -329,6 +359,12 @@ spec: - name: run-lvm mountPath: 
              readOnly: false
+            - name: run-udev
+              mountPath: /run/udev
+              readOnly: false
+            - name: pod-etc-lvm
+              mountPath: /etc/lvm
+              readOnly: false
            - name: data
              mountPath: /var/lib/ceph/osd
              readOnly: false
@@ -354,6 +390,11 @@ spec:
        - name: run-lvm
          hostPath:
            path: /run/lvm
+        - name: run-udev
+          hostPath:
+            path: /run/udev
+        - name: pod-etc-lvm
+          emptyDir: {}
        - name: pod-var-lib-ceph
          emptyDir: {}
        - name: pod-var-lib-ceph-tmp
diff --git a/ceph-osd/templates/utils/_osd_daemonset_overrides.tpl b/ceph-osd/templates/utils/_osd_daemonset_overrides.tpl
index 2cbefdabe..85969f521 100644
--- a/ceph-osd/templates/utils/_osd_daemonset_overrides.tpl
+++ b/ceph-osd/templates/utils/_osd_daemonset_overrides.tpl
@@ -328,13 +328,13 @@ limitations under the License.
       {{- $tmpcontainerEnv := omit $context.Values._tmpYAMLcontainer "env" }}
       {{- if eq $v.data.type "bluestore" }}
         {{- if and $v.block_db $v.block_wal}}
-          {{ $containerEnv := prepend (prepend (prepend ( prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db)) (dict "name" "BLOCK_WAL" "value" $v.block_wal) }}
+          {{ $containerEnv := prepend (prepend (prepend ( prepend ( prepend ( prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db.location)) (dict "name" "BLOCK_DB_SIZE" "value" $v.block_db.size)) (dict "name" "BLOCK_WAL" "value" $v.block_wal.location)) (dict "name" "BLOCK_WAL_SIZE" "value" $v.block_wal.size) }}
           {{- $_ := set $tmpcontainerEnv "env" $containerEnv }}
         {{- else if $v.block_db }}
-          {{ $containerEnv := prepend (prepend ( prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db) }}
+          {{ $containerEnv := prepend (prepend ( prepend ( prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db.location)) (dict "name" "BLOCK_DB_SIZE" "value" $v.block_db.size) }}
           {{- $_ := set $tmpcontainerEnv "env" $containerEnv }}
         {{- else if $v.block_wal }}
-          {{ $containerEnv := prepend (prepend ( prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_WAL" "value" $v.block_wal) }}
+          {{ $containerEnv := prepend (prepend ( prepend ( prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_WAL" "value" $v.block_wal.location)) (dict "name" "BLOCK_WAL_SIZE" "value" $v.block_wal.size) }}
           {{- $_ := set $tmpcontainerEnv "env" $containerEnv }}
         {{ else }}
           {{ $containerEnv := prepend (prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location) }}
@@ -357,13 +357,13 @@ limitations under the License.
       {{- $tmpinitcontainerEnv := omit $context.Values._tmpYAMLinitContainer "env" }}
       {{- if eq $v.data.type "bluestore" }}
         {{- if and $v.block_db $v.block_wal}}
-          {{ $initcontainerEnv := prepend (prepend (prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db)) (dict "name" "BLOCK_WAL" "value" $v.block_wal) }}
+          {{ $initcontainerEnv := prepend (prepend (prepend ( prepend ( prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db.location)) (dict "name" "BLOCK_DB_SIZE" "value" $v.block_db.size)) (dict "name" "BLOCK_WAL" "value" $v.block_wal.location)) (dict "name" "BLOCK_WAL_SIZE" "value" $v.block_wal.size) }}
           {{- $_ := set $tmpinitcontainerEnv "env" $initcontainerEnv }}
         {{- else if $v.block_db }}
-          {{ $initcontainerEnv := prepend (prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db) }}
+          {{ $initcontainerEnv := prepend (prepend ( prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db.location)) (dict "name" "BLOCK_DB_SIZE" "value" $v.block_db.size) }}
           {{- $_ := set $tmpinitcontainerEnv "env" $initcontainerEnv }}
         {{- else if $v.block_wal }}
-          {{ $initcontainerEnv := prepend (prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_WAL" "value" $v.block_wal) }}
+          {{ $initcontainerEnv := prepend (prepend ( prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_WAL" "value" $v.block_wal.location)) (dict "name" "BLOCK_WAL_SIZE" "value" $v.block_wal.size) }}
           {{- $_ := set $tmpinitcontainerEnv "env" $initcontainerEnv }}
         {{ else }}
           {{ $initcontainerEnv := prepend (prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location) }}
diff --git a/ceph-osd/values.yaml b/ceph-osd/values.yaml
index d307df074..04128131b 100644
--- a/ceph-osd/values.yaml
+++ b/ceph-osd/values.yaml
@@ -42,6 +42,10 @@ labels:
     node_selector_key: ceph-osd
     node_selector_value: enabled
 
+# The Ceph cluster can now be deployed with either ceph-volume or ceph-disk;
+# ceph-disk is deprecated as of the Nautilus release.
+# ceph-disk remains the default because the gate scripts still use
+# directory-backed OSDs; change the default once the gates move to disk-backed OSDs.
 deploy:
   tool: "ceph-disk"
 
@@ -209,12 +213,22 @@ conf:
 #          location: /var/lib/openstack-helm/ceph/osd/journal-one
 
 #     - data:
-#        type: bluestore
-#        location: /dev/sdb
+#         type: bluestore
+#         location: /dev/sdb
 #       Separate block devices may be used for block.db and/or block.wal
 #       Without these values they will be co-located on the data volume
-#       block_db: /dev/sdc
-#       block_wal: /dev/sdc
+#       Specify the location and size. It is recommended that the
+#       block_db size isn't smaller than 4% of the block size. For example,
+#       if the block size is 1TB, then block_db shouldn't be less than 40GB.
+#       A size suffix of K for kilobytes, M for megabytes, G for gigabytes,
+#       T for terabytes, P for petabytes or E for exabytes is optional.
+#       The default unit is megabytes.
+#       block_db:
+#         location: /dev/sdc
+#         size: "96GB"
+#       block_wal:
+#         location: /dev/sdc
+#         size: "2GB"
 #     - data:
 #         type: block-logical
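
For reference, a minimal values override exercising the new schema might look like the sketch below. It assumes the conf.storage.osd layout shown in the values.yaml hunk above; /dev/sdb and /dev/sdc are placeholder devices, and the sizes simply reuse the commented examples.

    # values-ceph-volume.yaml -- illustrative sketch only; device paths are placeholders
    deploy:
      tool: "ceph-volume"

    conf:
      storage:
        osd:
          - data:
              type: bluestore
              location: /dev/sdb    # data device (placeholder)
            block_db:
              location: /dev/sdc    # shared db/wal device (placeholder)
              size: "96GB"
            block_wal:
              location: /dev/sdc
              size: "2GB"

Since cross-upgrading between filestore and bluestore is not supported, an override like this should only target fresh disks or OSDs already deployed as bluestore.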
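Given that override, the _osd_daemonset_overrides.tpl change above prepends six storage variables to the container and init-container env. Because each sprig prepend puts its entry at the head of the list, the last-prepended variable renders first; the abridged rendering below is an inference from the template logic, not chart output.

    # Abridged rendered env for the bluestore example above (assumed ordering)
    env:
      - name: BLOCK_WAL_SIZE
        value: "2GB"
      - name: BLOCK_WAL
        value: /dev/sdc
      - name: BLOCK_DB_SIZE
        value: "96GB"
      - name: BLOCK_DB
        value: /dev/sdc
      - name: STORAGE_LOCATION
        value: /dev/sdb
      - name: STORAGE_TYPE
        value: bluestore

The init script consumes BLOCK_DB/BLOCK_DB_SIZE and BLOCK_WAL/BLOCK_WAL_SIZE to carve the ceph-db-* and ceph-wal-* logical volumes with lvcreate before calling ceph-volume lvm prepare.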