[ceph-osd] Alias synchronized commands and fix descriptor leak

There are many race conditions possible when multiple ceph-osd
pods are initialized on the same host at the same time using
shared metadata disks. The locked() function was introduced a
while back to address these, but some commands weren't locked,
locked() was being called all over the place, and there was a file
descriptor leak in locked(). This change cleans that up by
maintaining a single, global file descriptor for the lock file
that is only opened and closed once, and also by aliasing all of
the commands that need to use locked() and removing explicit calls
to locked() everywhere.

The global_locked() function has also been removed as it isn't
needed when individual commands that interact with disks use
locked() properly.

Change-Id: I0018cf0b3a25bced44c57c40e33043579c42de7a
This commit is contained in:
Stephen Taylor 2020-12-14 11:47:27 -07:00
parent 9b1ac0ffcb
commit 885285139e
5 changed files with 142 additions and 58 deletions

View File

@ -15,6 +15,6 @@ apiVersion: v1
appVersion: v1.0.0
description: OpenStack-Helm Ceph OSD
name: ceph-osd
version: 0.1.13
version: 0.1.14
home: https://github.com/ceph/ceph
...

View File

@ -150,3 +150,6 @@ exec /usr/bin/ceph-osd \
--setuser ceph \
--setgroup disk & echo $! > /run/ceph-osd.pid
wait
# Clean up resources held by the common script
common_cleanup

View File

@ -111,3 +111,6 @@ exec /usr/bin/ceph-osd \
--setuser ceph \
--setgroup disk & echo $! > /run/ceph-osd.pid
wait
# Clean up resources held by the common script
common_cleanup

View File

@ -15,6 +15,9 @@ limitations under the License.
*/}}
set -ex
shopt -s expand_aliases
export lock_fd=''
export ALREADY_LOCKED=0
export PS4='+${BASH_SOURCE:+$(basename ${BASH_SOURCE}):${LINENO}:}${FUNCNAME:+${FUNCNAME}():} '
: "${CRUSH_LOCATION:=root=default host=${HOSTNAME}}"
@ -25,6 +28,85 @@ export PS4='+${BASH_SOURCE:+$(basename ${BASH_SOURCE}):${LINENO}:}${FUNCNAME:+${
: "${OSD_JOURNAL_SIZE:=$(awk '/^osd_journal_size/{print $3}' ${CEPH_CONF}.template)}"
: "${OSD_WEIGHT:=1.0}"
# Obtain a global lock on /var/lib/ceph/tmp/init-osd.lock
function lock() {
  # Open a file descriptor for the lock file if there isn't one already.
  # The fd stays open for the life of the script; common_cleanup() closes it.
  if [[ -z "${lock_fd}" ]]; then
    exec {lock_fd}>/var/lib/ceph/tmp/init-osd.lock || exit 1
  fi
  # Fail hard if the lock cannot be acquired within the 600s timeout.
  # Proceeding without the lock would silently defeat the synchronization
  # that protects shared metadata disks from concurrent pod init.
  flock -w 600 "${lock_fd}" &> /dev/null || exit 1
  ALREADY_LOCKED=1
}
# Release the global lock on /var/lib/ceph/tmp/init-osd.lock
# Drop the global lock on /var/lib/ceph/tmp/init-osd.lock.
# The file descriptor is left open so a later lock() can reuse it.
function unlock() {
  # Ignore flock's output; releasing an already-released lock is harmless
  flock --unlock "${lock_fd}" > /dev/null 2>&1
  ALREADY_LOCKED=0
}
# "Destructor" for common.sh, must be called by scripts that source this one
# "Destructor" for common.sh; must be called by every script that sources it.
# Releases the global lock if held and closes the lock-file descriptor so it
# is not leaked to any child processes or left dangling at exit.
function common_cleanup() {
  # Nothing to do if the lock file was never opened
  [[ -n "${lock_fd}" ]] || return 0
  # Release the lock if it is still held
  if [[ ${ALREADY_LOCKED} -ne 0 ]]; then
    unlock
  fi
  # Close the lock-file descriptor (eval needed to expand the fd variable)
  eval "exec ${lock_fd}>&-"
}
# Run a command within the global synchronization lock
# Run a command while holding the global synchronization lock.
# Re-entrant: if the lock is already held by this shell, the command runs
# without re-locking (and without unlocking afterward), avoiding deadlocks.
# Propagates the wrapped command's exit status to the caller; previously the
# status of the trailing unlock/if leaked out, so `if locked cmd; then ...`
# and `set -e` saw the wrong result.
function locked() {
  local LOCK_SCOPE=0
  local rc=0
  # Allow locks to be re-entrant to avoid deadlocks
  if [[ ${ALREADY_LOCKED} -eq 0 ]]; then
    lock
    LOCK_SCOPE=1
  fi
  # Execute the synchronized command, capturing its status (the `|| rc=$?`
  # also keeps `set -e` from exiting before the lock is released)
  "$@" || rc=$?
  # Only unlock if the lock was obtained in this scope
  if [[ ${LOCK_SCOPE} -ne 0 ]]; then
    unlock
  fi
  return ${rc}
}
# Alias every command that interacts with disks so each invocation is
# automatically serialized through locked(). Requires `shopt -s expand_aliases`
# (set at the top of this script) for the aliases to expand in scripts.
for _locked_cmd in \
    dmsetup \
    pvs vgs lvs \
    pvdisplay vgdisplay lvdisplay \
    pvcreate vgcreate lvcreate \
    pvremove vgremove lvremove \
    pvrename vgrename lvrename \
    pvchange vgchange lvchange \
    pvscan vgscan lvscan \
    lvm_scan partprobe ceph-volume \
    disk_zap zap_extra_partitions udev_settle \
    wipefs sgdisk dd; do
  alias "${_locked_cmd}=locked ${_locked_cmd}"
done
unset _locked_cmd
eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))')
eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))')
@ -74,19 +156,6 @@ function ceph_cmd_retry() {
done
}
function locked() {
exec {lock_fd}>/var/lib/ceph/tmp/init-osd.lock || exit 1
flock -w 600 --verbose "${lock_fd}" &> /dev/null
"$@"
flock -u "${lock_fd}" &> /dev/null
}
function global_locked() {
exec {global_lock_fd}>/var/lib/ceph/tmp/init-osd-global.lock || exit 1
flock -w 600 --verbose "${global_lock_fd}" &> /dev/null
"$@"
flock -u "${global_lock_fd}" &> /dev/null
}
function crush_create_or_move {
local crush_location=${1}
ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
@ -242,13 +311,13 @@ function disk_zap {
dmsetup remove ${dm_device}
fi
done
local logical_volumes=$(locked lvdisplay | grep "LV Path" | grep "$device_filter" | awk '/ceph/{print $3}' | tr '\n' ' ')
local logical_volumes=$(lvdisplay | grep "LV Path" | grep "$device_filter" | awk '/ceph/{print $3}' | tr '\n' ' ')
for logical_volume in ${logical_volumes}; do
if [[ ! -z ${logical_volume} ]]; then
locked lvremove -y ${logical_volume}
lvremove -y ${logical_volume}
fi
done
local volume_group=$(locked pvdisplay -ddd -v ${device} | grep "VG Name" | awk '/ceph/{print $3}' | grep "ceph")
local volume_group=$(pvdisplay -ddd -v ${device} | grep "VG Name" | awk '/ceph/{print $3}' | grep "ceph")
if [[ ${volume_group} ]]; then
vgremove -y ${volume_group}
pvremove -y ${device}
@ -274,7 +343,7 @@ function udev_settle {
osd_devices="${OSD_DEVICE}"
udevadm settle --timeout=600
partprobe "${OSD_DEVICE}"
locked lvm_scan
lvm_scan
if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then
if [ ! -z "$BLOCK_DB" ]; then
osd_devices="${osd_devices}\|${BLOCK_DB}"
@ -282,9 +351,9 @@ function udev_settle {
local block_db="$BLOCK_DB"
local db_vg="$(echo $block_db | cut -d'/' -f1)"
if [ ! -z "$db_vg" ]; then
block_db=$(locked pvdisplay -ddd -v | grep -B1 "$db_vg" | awk '/PV Name/{print $3}')
block_db=$(pvdisplay -ddd -v | grep -B1 "$db_vg" | awk '/PV Name/{print $3}')
fi
locked partprobe "${block_db}"
partprobe "${block_db}"
fi
if [ ! -z "$BLOCK_WAL" ] && [ "$BLOCK_WAL" != "$BLOCK_DB" ]; then
osd_devices="${osd_devices}\|${BLOCK_WAL}"
@ -292,9 +361,9 @@ function udev_settle {
local block_wal="$BLOCK_WAL"
local wal_vg="$(echo $block_wal | cut -d'/' -f1)"
if [ ! -z "$wal_vg" ]; then
block_wal=$(locked pvdisplay -ddd -v | grep -B1 "$wal_vg" | awk '/PV Name/{print $3}')
block_wal=$(pvdisplay -ddd -v | grep -B1 "$wal_vg" | awk '/PV Name/{print $3}')
fi
locked partprobe "${block_wal}"
partprobe "${block_wal}"
fi
else
if [ "x$JOURNAL_TYPE" == "xblock-logical" ] && [ ! -z "$OSD_JOURNAL" ]; then
@ -302,7 +371,7 @@ function udev_settle {
if [ ! -z "$OSD_JOURNAL" ]; then
local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
osd_devices="${osd_devices}\|${JDEV}"
locked partprobe "${JDEV}"
partprobe "${JDEV}"
fi
fi
fi
@ -328,7 +397,7 @@ function udev_settle {
function get_lv_from_device {
device="$1"
locked pvdisplay -ddd -v -m ${device} | awk '/Logical volume/{print $3}'
pvdisplay -ddd -v -m ${device} | awk '/Logical volume/{print $3}'
}
# Helper function to get an lvm tag from a logical volume
@ -341,7 +410,7 @@ function get_lvm_tag_from_volume {
echo
else
# Get and return the specified tag from the logical volume
locked lvs -o lv_tags ${logical_volume} | tr ',' '\n' | grep ${tag} | cut -d'=' -f2
lvs -o lv_tags ${logical_volume} | tr ',' '\n' | grep ${tag} | cut -d'=' -f2
fi
}
@ -361,7 +430,7 @@ function get_lv_size_from_device {
device="$1"
logical_volume="$(get_lv_from_device ${device})"
locked lvs ${logical_volume} -o LV_SIZE --noheadings --units k --nosuffix | xargs | cut -d'.' -f1
lvs ${logical_volume} -o LV_SIZE --noheadings --units k --nosuffix | xargs | cut -d'.' -f1
}
# Helper function to get the crush weight for an osd device
@ -435,12 +504,12 @@ function get_lvm_path_from_device {
select="$1"
options="--noheadings -o lv_dm_path"
locked pvs ${options} -S "${select}" | tr -d ' '
pvs ${options} -S "${select}" | tr -d ' '
}
function get_vg_name_from_device {
device="$1"
pv_uuid=$(locked pvdisplay -ddd -v ${device} | awk '/PV UUID/{print $3}')
pv_uuid=$(pvdisplay -ddd -v ${device} | awk '/PV UUID/{print $3}')
if [[ "${pv_uuid}" ]]; then
echo "ceph-vg-${pv_uuid}"
@ -450,7 +519,7 @@ function get_vg_name_from_device {
function get_lv_name_from_device {
device="$1"
device_type="$2"
pv_uuid=$(locked pvdisplay -ddd -v ${device} | awk '/PV UUID/{print $3}')
pv_uuid=$(pvdisplay -ddd -v ${device} | awk '/PV UUID/{print $3}')
if [[ "${pv_uuid}" ]]; then
echo "ceph-${device_type}-${pv_uuid}"

View File

@ -38,36 +38,42 @@ else
export OSD_JOURNAL=$(readlink -f ${JOURNAL_LOCATION})
fi
# Set up aliases for functions that require disk synchronization
alias rename_vg='locked rename_vg'
alias rename_lvs='locked rename_lvs'
alias update_lv_tags='locked update_lv_tags'
alias prep_device='locked prep_device'
# Renames a single VG if necessary
function rename_vg {
local physical_disk=$1
local old_vg_name=$(locked pvdisplay -ddd -v ${physical_disk} | awk '/VG Name/{print $3}')
local old_vg_name=$(pvdisplay -ddd -v ${physical_disk} | awk '/VG Name/{print $3}')
local vg_name=$(get_vg_name_from_device ${physical_disk})
if [[ "${old_vg_name}" ]] && [[ "${vg_name}" != "${old_vg_name}" ]]; then
locked vgrename ${old_vg_name} ${vg_name}
vgrename ${old_vg_name} ${vg_name}
fi
}
# Renames all LVs associated with an OSD as necessary
function rename_lvs {
local data_disk=$1
local vg_name=$(locked pvdisplay -ddd -v ${data_disk} | awk '/VG Name/{print $3}')
local vg_name=$(pvdisplay -ddd -v ${data_disk} | awk '/VG Name/{print $3}')
if [[ "${vg_name}" ]]; then
# Rename the OSD volume if necessary
local old_lv_name=$(locked lvdisplay ${vg_name} | awk '/LV Name/{print $3}')
local old_lv_name=$(lvdisplay ${vg_name} | awk '/LV Name/{print $3}')
local lv_name=$(get_lv_name_from_device ${data_disk} lv)
if [[ "${old_lv_name}" ]] && [[ "${lv_name}" != "${old_lv_name}" ]]; then
locked lvrename ${vg_name} ${old_lv_name} ${lv_name}
lvrename ${vg_name} ${old_lv_name} ${lv_name}
fi
# Rename the OSD's block.db volume if necessary, referenced by UUID
local lv_tag=$(get_lvm_tag_from_device ${data_disk} ceph.db_uuid)
if [[ "${lv_tag}" ]]; then
local lv_device=$(locked lvdisplay | grep -B4 "${lv_tag}" | awk '/LV Path/{print $3}')
local lv_device=$(lvdisplay | grep -B4 "${lv_tag}" | awk '/LV Path/{print $3}')
if [[ "${lv_device}" ]]; then
local db_vg=$(echo ${lv_device} | awk -F "/" '{print $3}')
@ -75,7 +81,7 @@ function rename_lvs {
local db_name=$(get_lv_name_from_device ${data_disk} db)
if [[ "${old_lv_name}" ]] && [[ "${db_name}" != "${old_lv_name}" ]]; then
locked lvrename ${db_vg} ${old_lv_name} ${db_name}
lvrename ${db_vg} ${old_lv_name} ${db_name}
fi
fi
fi
@ -84,7 +90,7 @@ function rename_lvs {
lv_tag=$(get_lvm_tag_from_device ${data_disk} ceph.wal_uuid)
if [[ "${lv_tag}" ]]; then
local lv_device=$(locked lvdisplay | grep -B4 "${lv_tag}" | awk '/LV Path/{print $3}')
local lv_device=$(lvdisplay | grep -B4 "${lv_tag}" | awk '/LV Path/{print $3}')
if [[ "${lv_device}" ]]; then
local wal_vg=$(echo ${lv_device} | awk -F "/" '{print $3}')
@ -92,7 +98,7 @@ function rename_lvs {
local wal_name=$(get_lv_name_from_device ${data_disk} wal)
if [[ "${old_lv_name}" ]] && [[ "${wal_name}" != "${old_lv_name}" ]]; then
locked lvrename ${wal_vg} ${old_lv_name} ${wal_name}
lvrename ${wal_vg} ${old_lv_name} ${wal_name}
fi
fi
fi
@ -104,10 +110,10 @@ function rename_lvs {
# renaming should be completed prior to calling this
function update_lv_tags {
local data_disk=$1
local pv_uuid=$(locked pvdisplay -ddd -v ${data_disk} | awk '/PV UUID/{print $3}')
local pv_uuid=$(pvdisplay -ddd -v ${data_disk} | awk '/PV UUID/{print $3}')
if [[ "${pv_uuid}" ]]; then
local volumes="$(locked lvs --no-headings | grep -e "${pv_uuid}")"
local volumes="$(lvs --no-headings | grep -e "${pv_uuid}")"
local block_device db_device wal_device vg_name
local old_block_device old_db_device old_wal_device
@ -131,21 +137,21 @@ function update_lv_tags {
while read lv vg other_stuff; do
if [[ "${block_device}" ]]; then
if [[ "${old_block_device}" ]]; then
locked lvchange --deltag "ceph.block_device=${old_block_device}" /dev/${vg}/${lv}
lvchange --deltag "ceph.block_device=${old_block_device}" /dev/${vg}/${lv}
fi
locked lvchange --addtag "ceph.block_device=${block_device}" /dev/${vg}/${lv}
lvchange --addtag "ceph.block_device=${block_device}" /dev/${vg}/${lv}
fi
if [[ "${db_device}" ]]; then
if [[ "${old_db_device}" ]]; then
locked lvchange --deltag "ceph.db_device=${old_db_device}" /dev/${vg}/${lv}
lvchange --deltag "ceph.db_device=${old_db_device}" /dev/${vg}/${lv}
fi
locked lvchange --addtag "ceph.db_device=${db_device}" /dev/${vg}/${lv}
lvchange --addtag "ceph.db_device=${db_device}" /dev/${vg}/${lv}
fi
if [[ "${wal_device}" ]]; then
if [[ "${old_wal_device}" ]]; then
locked lvchange --deltag "ceph.wal_device=${old_wal_device}" /dev/${vg}/${lv}
lvchange --deltag "ceph.wal_device=${old_wal_device}" /dev/${vg}/${lv}
fi
locked lvchange --addtag "ceph.wal_device=${wal_device}" /dev/${vg}/${lv}
lvchange --addtag "ceph.wal_device=${wal_device}" /dev/${vg}/${lv}
fi
done <<< ${volumes}
fi
@ -188,7 +194,7 @@ function prep_device {
udev_settle
vg_name=$(get_vg_name_from_device ${BLOCK_DEVICE})
lv_name=$(get_lv_name_from_device ${data_disk} ${device_type})
VG=$(locked vgs --noheadings -o vg_name -S "vg_name=${vg_name}" | tr -d '[:space:]')
VG=$(vgs --noheadings -o vg_name -S "vg_name=${vg_name}" | tr -d '[:space:]')
if [[ $VG ]]; then
DEVICE_OSD_ID=$(get_osd_id_from_volume "/dev/${vg_name}/${lv_name}")
CEPH_LVM_PREPARE=1
@ -207,13 +213,13 @@ function prep_device {
CEPH_LVM_PREPARE=1
fi
random_uuid=$(uuidgen)
locked vgcreate "ceph-vg-${random_uuid}" "${BLOCK_DEVICE}"
vgcreate "ceph-vg-${random_uuid}" "${BLOCK_DEVICE}"
VG=$(get_vg_name_from_device ${BLOCK_DEVICE})
locked vgrename "ceph-vg-${random_uuid}" "${VG}"
vgrename "ceph-vg-${random_uuid}" "${VG}"
fi
logical_volume=$(locked lvs --noheadings -o lv_name -S "lv_name=${lv_name}" | tr -d '[:space:]')
logical_volume=$(lvs --noheadings -o lv_name -S "lv_name=${lv_name}" | tr -d '[:space:]')
if [[ $logical_volume != "${lv_name}" ]]; then
locked lvcreate -L "${BLOCK_DEVICE_SIZE}" -n "${lv_name}" "${VG}"
lvcreate -L "${BLOCK_DEVICE_SIZE}" -n "${lv_name}" "${VG}"
fi
if [[ "${device_type}" == "db" ]]; then
BLOCK_DB="${VG}/${lv_name}"
@ -399,7 +405,7 @@ function osd_disk_prepare {
OSD_VG=${vg_name}
fi
lv_name=$(get_lv_name_from_device ${OSD_DEVICE} lv)
if [[ ! "$(locked lvdisplay | awk '/LV Name/{print $3}' | grep ${lv_name})" ]]; then
if [[ ! "$(lvdisplay | awk '/LV Name/{print $3}' | grep ${lv_name})" ]]; then
lvcreate --yes -l 100%FREE -n ${lv_name} ${OSD_VG}
fi
OSD_LV=${OSD_VG}/${lv_name}
@ -416,15 +422,15 @@ function osd_disk_prepare {
block_wal_string=$(echo ${BLOCK_WAL} | awk -F "/" '{print $2 "-" $3}')
fi
if [[ ${BLOCK_DB} && ${BLOCK_WAL} ]]; then
global_locked prep_device "${BLOCK_DB}" "${BLOCK_DB_SIZE}" "db" "${OSD_DEVICE}"
global_locked prep_device "${BLOCK_WAL}" "${BLOCK_WAL_SIZE}" "wal" "${OSD_DEVICE}"
prep_device "${BLOCK_DB}" "${BLOCK_DB_SIZE}" "db" "${OSD_DEVICE}"
prep_device "${BLOCK_WAL}" "${BLOCK_WAL_SIZE}" "wal" "${OSD_DEVICE}"
elif [[ -z ${BLOCK_DB} && ${BLOCK_WAL} ]]; then
global_locked prep_device "${BLOCK_WAL}" "${BLOCK_WAL_SIZE}" "wal" "${OSD_DEVICE}"
prep_device "${BLOCK_WAL}" "${BLOCK_WAL_SIZE}" "wal" "${OSD_DEVICE}"
elif [[ ${BLOCK_DB} && -z ${BLOCK_WAL} ]]; then
global_locked prep_device "${BLOCK_DB}" "${BLOCK_DB_SIZE}" "db" "${OSD_DEVICE}"
prep_device "${BLOCK_DB}" "${BLOCK_DB_SIZE}" "db" "${OSD_DEVICE}"
fi
else
if locked pvdisplay -ddd -v ${OSD_DEVICE} | awk '/VG Name/{print $3}' | grep "ceph"; then
if pvdisplay -ddd -v ${OSD_DEVICE} | awk '/VG Name/{print $3}' | grep "ceph"; then
CEPH_LVM_PREPARE=0
fi
fi
@ -451,7 +457,7 @@ function osd_disk_prepare {
fi
if [[ CEPH_LVM_PREPARE -eq 1 ]]; then
locked ceph-volume lvm -v prepare ${CLI_OPTS}
ceph-volume lvm -v prepare ${CLI_OPTS}
udev_settle
fi
}
@ -502,3 +508,6 @@ function osd_journal_prepare {
if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then
osd_disk_prepare
fi
# Clean up resources held by the common script
common_cleanup