diff --git a/bsp-files/centos-ks-gen.pl b/bsp-files/centos-ks-gen.pl index 32c4a2e4..469837b3 100755 --- a/bsp-files/centos-ks-gen.pl +++ b/bsp-files/centos-ks-gen.pl @@ -42,6 +42,7 @@ write_config_file("controller", "pre_pkglist.cfg", "pre_disk_setup_common.cfg", "pre_disk_controller.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_controller.cfg", "post_common.cfg", "post_kernel_controller.cfg", @@ -54,6 +55,7 @@ write_config_file("controller-worker", "pre_pkglist.cfg", "pre_disk_setup_common.cfg", "pre_disk_aio.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_aio.cfg", "post_common.cfg", "post_kernel_aio_and_worker.cfg", @@ -67,6 +69,7 @@ write_config_file("controller-worker-lowlatency", "pre_pkglist_lowlatency.cfg", "pre_disk_setup_common.cfg", "pre_disk_aio.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_aio_lowlatency.cfg", "post_common.cfg", "post_kernel_aio_and_worker.cfg", @@ -84,6 +87,7 @@ write_config_file("controller", "pre_pkglist.cfg", "pre_disk_setup_common.cfg", "pre_disk_controller.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_controller.cfg", "post_common.cfg", "post_kernel_controller.cfg", @@ -95,6 +99,7 @@ write_config_file("controller-worker", "pre_pkglist.cfg", "pre_disk_setup_common.cfg", "pre_disk_aio.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_aio.cfg", "post_common.cfg", "post_kernel_aio_and_worker.cfg", @@ -107,6 +112,7 @@ write_config_file("controller-worker-lowlatency", "pre_pkglist_lowlatency.cfg", "pre_disk_setup_common.cfg", "pre_disk_aio.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_aio_lowlatency.cfg", "post_common.cfg", "post_kernel_aio_and_worker.cfg", @@ -123,6 +129,7 @@ write_config_file("controller", "pre_pkglist.cfg", "pre_disk_setup_common.cfg", "pre_disk_controller.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_controller.cfg", "post_common.cfg", "post_kernel_controller.cfg", @@ -136,6 +143,7 @@ write_config_file("controller-worker", "pre_pkglist.cfg", 
"pre_disk_setup_common.cfg", "pre_disk_aio.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_aio.cfg", "post_common.cfg", "post_kernel_aio_and_worker.cfg", @@ -150,6 +158,7 @@ write_config_file("controller-worker-lowlatency", "pre_pkglist_lowlatency.cfg", "pre_disk_setup_common.cfg", "pre_disk_aio.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_aio_lowlatency.cfg", "post_common.cfg", "post_kernel_aio_and_worker.cfg", @@ -164,6 +173,7 @@ write_config_file("worker", "pre_pkglist.cfg", "pre_disk_setup_common.cfg", "pre_disk_worker.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_worker.cfg", "post_common.cfg", "post_kernel_aio_and_worker.cfg", @@ -176,6 +186,7 @@ write_config_file("worker-lowlatency", "pre_pkglist_lowlatency.cfg", "pre_disk_setup_common.cfg", "pre_disk_worker.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_worker_lowlatency.cfg", "post_common.cfg", "post_kernel_aio_and_worker.cfg", @@ -188,6 +199,7 @@ write_config_file("storage", "pre_pkglist.cfg", "pre_disk_setup_common.cfg", "pre_disk_storage.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_storage.cfg", "post_common.cfg", "post_kernel_storage.cfg", @@ -208,6 +220,7 @@ foreach $server (keys %boot_servers) "pre_pkglist.cfg", "pre_disk_setup_common.cfg", "pre_disk_controller.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_controller.cfg", "post_common.cfg", "post_kernel_controller.cfg", @@ -219,6 +232,7 @@ foreach $server (keys %boot_servers) "pre_pkglist.cfg", "pre_disk_setup_common.cfg", "pre_disk_aio.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_aio.cfg", "post_common.cfg", "post_kernel_aio_and_worker.cfg", @@ -231,6 +245,7 @@ foreach $server (keys %boot_servers) "pre_pkglist_lowlatency.cfg", "pre_disk_setup_common.cfg", "pre_disk_aio.cfg", + "pre_disk_setup_tail.cfg", "post_platform_conf_aio_lowlatency.cfg", "post_common.cfg", "post_kernel_aio_and_worker.cfg", diff --git a/bsp-files/kickstarts/functions.sh b/bsp-files/kickstarts/functions.sh index 
ebbbe29f..2e3a8711 100644 --- a/bsp-files/kickstarts/functions.sh +++ b/bsp-files/kickstarts/functions.sh @@ -9,6 +9,17 @@ cat <<END_FUNCTIONS >/tmp/ks-functions.sh # SPDX-License-Identifier: Apache-2.0 # +# Get the FD used by subshells to log output +if [ -z "\$stdout" ]; then + exec {stdout}>&1 +fi + +function wlog() +{ + local dt="\$(date "+%Y-%m-%d %H:%M:%S.%3N")" + echo "\$dt - \$1" >&\${stdout} +} + function get_by_path() { local disk=\$(cd /dev ; readlink -f \$1) @@ -73,7 +84,7 @@ function get_http_port() echo \$(cat /proc/cmdline |xargs -n1 echo |grep '^inst.repo=' | sed -r 's#^[^/]*://[^/]*:([0-9]*)/.*#\1#') } -get_disk_dev() +function get_disk_dev() { local disk # Detect HDD @@ -97,5 +108,52 @@ get_disk_dev() done } +function exec_no_fds() +{ + # Close open FDs when executing commands that complain about leaked FDs. + local fds=\$1 + local cmd=\$2 + local retries=\$3 + local interval=\$4 + local ret_code=0 + local ret_stdout="" + for fd in \$fds + do + local cmd="\$cmd \$fd>&-" + done + if [ -z "\$retries" ]; then + #wlog "Running command: '\$cmd'." + eval "\$cmd" + else + ret_stdout=\$(exec_retry "\$retries" "\$interval" "\$cmd") + ret_code=\$? + echo "\${ret_stdout}" + return \${ret_code} + fi +} + +function exec_retry() +{ + local retries=\$1 + local interval=\$2 + local cmd=\$3 + local -i retry_count=1 + local ret_code=0 + local ret_stdout="" + cmd="\$cmd" # 2>&\$stdout" + while [ \$retry_count -le \$retries ]; do + #wlog "Running command: '\$cmd'." + ret_stdout=\$(eval "\$cmd") + ret_code=\$? + [ \$ret_code -eq 0 ] && break + wlog "Error running command '\${cmd}'. Try \${retry_count} of \${retries} at \${interval}s." + wlog "ret_code: \${ret_code}, stdout: '\${ret_stdout}'." 
+ sleep \$interval + let retry_count++ + done + echo "\${ret_stdout}" + return \${ret_code} +} + END_FUNCTIONS diff --git a/bsp-files/kickstarts/post_common.cfg b/bsp-files/kickstarts/post_common.cfg index b84da62f..82140084 100644 --- a/bsp-files/kickstarts/post_common.cfg +++ b/bsp-files/kickstarts/post_common.cfg @@ -1,9 +1,12 @@ -%post --nochroot +%post --nochroot --erroronfail + +# Source common functions +. /tmp/ks-functions.sh # Change GUID of backup partition change_guid=/tmp/backup-guid-change.sh if [ -f "$change_guid" ]; then - sh $change_guid + sh $change_guid || report_post_failure_with_logfile "ERROR: Failed to update platform backup GUID" fi %end diff --git a/bsp-files/kickstarts/pre_disk_aio.cfg b/bsp-files/kickstarts/pre_disk_aio.cfg index 5832ac50..ded2b735 100755 --- a/bsp-files/kickstarts/pre_disk_aio.cfg +++ b/bsp-files/kickstarts/pre_disk_aio.cfg @@ -85,7 +85,7 @@ ## cgts-vg PV (142G), cgts-vg PV (336G) ## -sz=$(blockdev --getsize64 $(get_disk $ROOTFS_DISK)) +sz=$(blockdev --getsize64 $rootfs_device) if [ $sz -le $((240*$gb)) ] ; then # Round CGCS_PV_SIZE to the closest upper value that can be divided by 1024. # 190480/1024=186.02. CGCS_PV_SIZE=187*1024=191488. Using a disk with a @@ -119,12 +119,16 @@ if [ -d /sys/firmware/efi ] ; then END_POINT=$(($START_POINT + $PLATFORM_BACKUP_SIZE)) BACKUP_END_POINT=$END_POINT if [ $BACKUP_CREATED -eq 0 ] ; then - parted -s $ROOTFS_DISK mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB + wlog "Creating platform backup partition of ${PLATFORM_BACKUP_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." + exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" 
fi START_POINT=$END_POINT END_POINT=$(($START_POINT + $EFI_SIZE)) - parted -s $ROOTFS_DISK mkpart primary fat32 ${START_POINT}MiB ${END_POINT}MiB + wlog "Creating EFI partition of ${EFI_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." + exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary fat32 ${START_POINT}MiB ${END_POINT}MiB" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" cat<<EOF>>/tmp/part-include part /boot/efi --fstype=efi --onpart=${ROOTFS_PART_PREFIX}2 @@ -132,13 +136,17 @@ EOF else BACKUP_PART=${ROOTFS_PART_PREFIX}2 BACKUP_PART_NO=2 - parted -s $ROOTFS_DISK mkpart primary 1MiB 2MiB + wlog "Creating 1MB BIOS GRUB partition from 1MiB to 2MiB." + exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary 1MiB 2MiB" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" START_POINT=2 END_POINT=$(($START_POINT + $PLATFORM_BACKUP_SIZE)) BACKUP_END_POINT=$END_POINT if [ $BACKUP_CREATED -eq 0 ] ; then - parted -s $ROOTFS_DISK mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB + wlog "Creating platform backup partition of ${PLATFORM_BACKUP_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." + exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" fi cat<<EOF>>/tmp/part-include part biosboot --asprimary --fstype=biosboot --onpart=${ROOTFS_PART_PREFIX}1 @@ -147,31 +155,47 @@ fi START_POINT=$END_POINT END_POINT=$(($START_POINT + $BOOT_SIZE)) -parted -s $ROOTFS_DISK mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB +wlog "Creating boot partition of ${BOOT_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." +exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB" +[ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" 
START_POINT=$END_POINT END_POINT=$(($START_POINT + $ROOTFS_SIZE)) -parted -s $ROOTFS_DISK mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB +wlog "Creating rootfs partition of ${ROOTFS_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." +exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB" +[ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" START_POINT=$END_POINT END_POINT=$(($START_POINT + $CGCS_PV_SIZE)) -parted -s $ROOTFS_DISK mkpart extended ${START_POINT}MiB ${END_POINT}MiB +wlog "Creating cgcs-vg partition of ${CGCS_PV_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." +exec_retry 5 0.5 "parted -s $rootfs_device mkpart extended ${START_POINT}MiB ${END_POINT}MiB" +[ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" if [ $BACKUP_CREATED -ne 0 ] ; then BACKUP_CURRENT_SIZE=$(parted -s $BACKUP_PART unit MiB print | grep $BACKUP_PART | awk '{print $3}' | sed 's/[^C0-9]*//g') if [ $BACKUP_CURRENT_SIZE -lt $PLATFORM_BACKUP_SIZE ] ; then + wlog "Backup partition size is ${BACKUP_CURRENT_SIZE}MiB, resizing to ${PLATFORM_BACKUP_SIZE}MiB." # parted will throw an error about overlapping with the next partition if we don't do this BACKUP_END_POINT=$(($BACKUP_END_POINT - 1)).9 - parted -s $ROOTFS_DISK resizepart $BACKUP_PART_NO ${BACKUP_END_POINT}MiB - e2fsck -p -f $BACKUP_PART - resize2fs $BACKUP_PART + exec_retry 5 0.5 "parted -s $rootfs_device resizepart $BACKUP_PART_NO ${BACKUP_END_POINT}MiB" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: resize of platform backup partition failed!" + exec_retry 2 0.1 "e2fsck -p -f $BACKUP_PART" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: e2fsck failed on platform backup partition!" + exec_retry 2 1 "resize2fs $BACKUP_PART" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Failed to resize ext4 fs of platform backup partition!" 
+ elif [ $BACKUP_CURRENT_SIZE -gt $PLATFORM_BACKUP_SIZE ] ; then + report_pre_failure_with_msg "ERROR: Backup partition is ${BACKUP_CURRENT_SIZE}MiB expected size is less or equal to ${PLATFORM_BACKUP_SIZE}MiB." + else + wlog "Backup partition size is correct: ${PLATFORM_BACKUP_SIZE}MiB." fi + cat<<EOF>>/tmp/part-include part /opt/platform-backup --fstype=ext4 --asprimary --noformat --onpart=$BACKUP_PART --fsoptions="$ROOTFS_OPTIONS" EOF else cat<<EOF>/tmp/backup-guid-change.sh -flock $ROOTFS_DISK sgdisk --change-name=${BACKUP_PART_NO}:"${BACKUP_PART_LABEL}" --typecode=${BACKUP_PART_NO}:"${BACKUP_PART_GUID}" $ROOTFS_DISK +echo "\$(date '+%Y-%m-%d %H:%M:%S.%3N') - Updating backup partition GUID." +flock $rootfs_device sgdisk --change-name=${BACKUP_PART_NO}:"${BACKUP_PART_LABEL}" --typecode=${BACKUP_PART_NO}:"${BACKUP_PART_GUID}" $rootfs_device || exit 1 EOF cat<<EOF>>/tmp/part-include @@ -188,5 +212,3 @@ logvol /scratch --fstype=ext4 --vgname=cgts-vg --size=$SCRATCH_VOL_SIZE --name=s part / --fstype=ext4 --asprimary --onpart=${ROOTFS_PART_PREFIX}4 --fsoptions="$ROOTFS_OPTIONS" EOF -%end - diff --git a/bsp-files/kickstarts/pre_disk_controller.cfg b/bsp-files/kickstarts/pre_disk_controller.cfg index 7ab8e7ed..1d246027 100755 --- a/bsp-files/kickstarts/pre_disk_controller.cfg +++ b/bsp-files/kickstarts/pre_disk_controller.cfg @@ -23,12 +23,16 @@ if [ -d /sys/firmware/efi ] ; then END_POINT=$(($START_POINT + $PLATFORM_BACKUP_SIZE)) BACKUP_END_POINT=$END_POINT if [ $BACKUP_CREATED -eq 0 ] ; then - parted -s $ROOTFS_DISK mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB + wlog "Creating platform backup partition of ${PLATFORM_BACKUP_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." + exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" 
fi START_POINT=$END_POINT END_POINT=$(($START_POINT + $EFI_SIZE)) - parted -s $ROOTFS_DISK mkpart primary fat32 ${START_POINT}MiB ${END_POINT}MiB + wlog "Creating EFI partition of ${EFI_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." + exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary fat32 ${START_POINT}MiB ${END_POINT}MiB" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" cat<<EOF>>/tmp/part-include part /boot/efi --fstype=efi --onpart=${ROOTFS_PART_PREFIX}2 @@ -36,13 +40,17 @@ EOF else BACKUP_PART=${ROOTFS_PART_PREFIX}2 BACKUP_PART_NO=2 - parted -s $ROOTFS_DISK mkpart primary 1MiB 2MiB + wlog "Creating 1MB BIOS GRUB partition from 1MiB to 2MiB." + exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary 1MiB 2MiB" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" START_POINT=2 END_POINT=$(($START_POINT + $PLATFORM_BACKUP_SIZE)) BACKUP_END_POINT=$END_POINT if [ $BACKUP_CREATED -eq 0 ] ; then - parted -s $ROOTFS_DISK mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB + wlog "Creating platform backup partition of ${PLATFORM_BACKUP_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." + exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" fi cat<<EOF>>/tmp/part-include part biosboot --asprimary --fstype=biosboot --onpart=${ROOTFS_PART_PREFIX}1 @@ -51,30 +59,45 @@ fi START_POINT=$END_POINT END_POINT=$(($START_POINT + $BOOT_SIZE)) -parted -s $ROOTFS_DISK mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB +wlog "Creating boot partition of ${BOOT_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." +exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB" +[ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" 
START_POINT=$END_POINT END_POINT=$(($START_POINT + $ROOTFS_SIZE)) -parted -s $ROOTFS_DISK mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB +wlog "Creating rootfs partition of ${ROOTFS_SIZE}MiB from ${START_POINT}MiB to ${END_POINT}MiB." +exec_retry 5 0.5 "parted -s $rootfs_device mkpart primary ext4 ${START_POINT}MiB ${END_POINT}MiB" +[ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" START_POINT=$END_POINT -parted -s $ROOTFS_DISK mkpart extended ${START_POINT}MiB 100% +wlog "Creating cgcs-vg partition of ${CGCS_PV_SIZE}MiB from ${START_POINT}MiB to 100%." +exec_retry 5 0.5 "parted -s $rootfs_device mkpart extended ${START_POINT}MiB 100%" +[ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Partition creation failed!" if [ $BACKUP_CREATED -ne 0 ] ; then BACKUP_CURRENT_SIZE=$(parted -s $BACKUP_PART unit MiB print | grep $BACKUP_PART | awk '{print $3}' | sed 's/[^C0-9]*//g') if [ $BACKUP_CURRENT_SIZE -lt $PLATFORM_BACKUP_SIZE ] ; then + wlog "Backup partition size is ${BACKUP_CURRENT_SIZE}MiB, resizing to ${PLATFORM_BACKUP_SIZE}MiB." # parted will throw an error about overlapping with the next partition if we don't do this BACKUP_END_POINT=$(($BACKUP_END_POINT - 1)).9 - parted -s $ROOTFS_DISK resizepart $BACKUP_PART_NO ${BACKUP_END_POINT}MiB - e2fsck -p -f $BACKUP_PART - resize2fs $BACKUP_PART + exec_retry 5 0.5 "parted -s $rootfs_device resizepart $BACKUP_PART_NO ${BACKUP_END_POINT}MiB" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: resize of platform backup partition failed!" + exec_retry 2 0.1 "e2fsck -p -f $BACKUP_PART" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: e2fsck failed on platform backup partition!" + exec_retry 2 1 "resize2fs $BACKUP_PART" + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Failed to resize ext4 fs of platform backup partition!" 
+ elif [ $BACKUP_CURRENT_SIZE -gt $PLATFORM_BACKUP_SIZE ] ; then + report_pre_failure_with_msg "ERROR: Backup partition is ${BACKUP_CURRENT_SIZE}MiB expected size is less or equal to ${PLATFORM_BACKUP_SIZE}MiB." + else + wlog "Backup partition size is correct: ${PLATFORM_BACKUP_SIZE}MiB." fi cat<<EOF>>/tmp/part-include part /opt/platform-backup --fstype=ext4 --asprimary --noformat --onpart=$BACKUP_PART --fsoptions="$ROOTFS_OPTIONS" EOF else cat<<EOF>/tmp/backup-guid-change.sh -flock $ROOTFS_DISK sgdisk --change-name=${BACKUP_PART_NO}:"${BACKUP_PART_LABEL}" --typecode=${BACKUP_PART_NO}:"${BACKUP_PART_GUID}" $ROOTFS_DISK +echo "\$(date '+%Y-%m-%d %H:%M:%S.%3N') - Updating backup partition GUID." +flock $rootfs_device sgdisk --change-name=${BACKUP_PART_NO}:"${BACKUP_PART_LABEL}" --typecode=${BACKUP_PART_NO}:"${BACKUP_PART_GUID}" $rootfs_device || exit 1 EOF cat<<EOF>>/tmp/part-include @@ -91,5 +114,3 @@ logvol /scratch --fstype=ext4 --vgname=cgts-vg --size=$SCRATCH_VOL_SIZE --name=s part / --fstype=ext4 --asprimary --onpart=${ROOTFS_PART_PREFIX}4 --fsoptions="$ROOTFS_OPTIONS" EOF -%end - diff --git a/bsp-files/kickstarts/pre_disk_setup_common.cfg b/bsp-files/kickstarts/pre_disk_setup_common.cfg index 44f66606..84f5394d 100644 --- a/bsp-files/kickstarts/pre_disk_setup_common.cfg +++ b/bsp-files/kickstarts/pre_disk_setup_common.cfg @@ -3,6 +3,9 @@ # Source common functions . /tmp/ks-functions.sh +wlog "ISO_DEV='$ISO_DEV'." +wlog "USB_DEV='$USB_DEV'." + # This is a really fancy way of finding the first usable disk for the # install and not stomping on the USB device if it comes up first @@ -17,13 +20,18 @@ if [ -z "$rootfs_device" ]; then rootfs_device=$(get_disk_dev) fi -# Convert to by-path +# Get root and boot devices orig_rootfs_device=$rootfs_device -rootfs_device=$(get_by_path $rootfs_device) +by_path_rootfs_device=$(get_by_path $rootfs_device) +rootfs_device=$(get_disk $by_path_rootfs_device) +wlog "Found rootfs $orig_rootfs_device on: $by_path_rootfs_device->$rootfs_device." 
orig_boot_device=$boot_device -boot_device=$(get_by_path $boot_device) +by_path_boot_device=$(get_by_path $boot_device) +boot_device=$(get_disk $by_path_boot_device) +wlog "Found boot $orig_boot_device on: $by_path_boot_device->$boot_device." +# Check if boot and rootfs devices are valid if [ ! -e "$rootfs_device" -o ! -e "$boot_device" ] ; then # Touch this file to prevent Anaconda from dying an ungraceful death touch /tmp/part-include @@ -31,9 +39,50 @@ if [ ! -e "$rootfs_device" -o ! -e "$boot_device" ] ; then report_pre_failure_with_msg "ERROR: Specified installation ($orig_rootfs_device) or boot ($orig_boot_device) device is invalid." fi +# Get all block devices of type disk in the system. This includes solid +# state devices. +# Note: /dev/* are managed by kernel tmpdevfs while links in /dev/disk/by-path/ +# are managed by udev which updates them asynchronously so we should avoid using +# them while performing partition operations. +STOR_DEVS="" +wlog "Detected storage devices:" +for f in /dev/disk/by-path/*; do + dev=$(readlink -f $f) + exec_retry 2 0.5 "lsblk --nodeps --pairs $dev" | grep -q 'TYPE="disk"' + if [ $? -eq 0 ] + then + STOR_DEVS="$STOR_DEVS $dev" + wlog " ${f}->${dev}" + fi +done + +if [ -z "$STOR_DEVS" ] +then + report_pre_failure_with_msg "ERROR: No storage devices available." +fi + +# Lock all devices so that udev doesn't trigger a kernel partition table +# rescan that removes and recreates all /dev nodes for partitions on those +# devices. Since udev events are asynchronous this could lead to a case +# where /dev/ links for existing partitions are briefly missing. +# Missing /dev links leads to command execution failures. +STOR_DEV_FDS="$stdout" +for dev in $STOR_DEVS; do + exec {fd}>$dev || report_pre_failure_with_msg "ERROR: Error creating file descriptor for $dev." + flock -n "$fd" || report_pre_failure_with_msg "ERROR: Can't get a lock on fd $fd of device $dev." 
+ STOR_DEV_FDS="$STOR_DEV_FDS $fd" +done + +# Log info about system state at beginning of partitioning operation +for dev in $STOR_DEVS; do + wlog "Initial partition table for $dev is:" + parted -s $dev unit mib print +done + # Ensure specified device is not a USB drive udevadm info --query=property --name=$rootfs_device |grep -q '^ID_BUS=usb' || \ udevadm info --query=property --name=$boot_device |grep -q '^ID_BUS=usb' + if [ $? -eq 0 ]; then # Touch this file to prevent Anaconda from dying an ungraceful death touch /tmp/part-include @@ -42,30 +91,48 @@ if [ $? -eq 0 ]; then fi # Deactivate existing volume groups to avoid Anaconda issues with pre-existing groups -vgs --noheadings -o vg_name | xargs --no-run-if-empty -n 1 vgchange -an +vgs=$(exec_no_fds "$STOR_DEV_FDS" "vgs --noheadings -o vg_name") +for vg in $vgs; do + wlog "Disabling $vg." + exec_no_fds "$STOR_DEV_FDS" "vgchange -an $vg" 5 0.5 + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Failed to disable $vg." +done # Remove the volume groups that have physical volumes on the root disk -for vg in $(vgs --noheadings -o vg_name); do - pvs --select "vg_name=$vg" --noheadings -o pv_name | grep -q "$(get_disk $rootfs_device)" +for vg in $(exec_no_fds "$STOR_DEV_FDS" "vgs --noheadings -o vg_name"); do + exec_no_fds "$STOR_DEV_FDS" "pvs --select \"vg_name=$vg\" --noheadings -o pv_name" | grep -q "$rootfs_device" if [ $? -ne 0 ]; then + wlog "Found $vg with no PV on rootfs, ignoring." continue fi - echo "Removing vg $vg" - lvremove --force $vg - pvs --select "vg_name=$vg" --noheadings -o pv_name | xargs --no-run-if-empty pvremove --force --force --yes - vgs --select "vg_name=$vg" --noheadings -o vg_name | xargs --no-run-if-empty vgremove --force + wlog "Removing LVs on $vg." + exec_no_fds "$STOR_DEV_FDS" "lvremove --force $vg" 5 0.5 || wlog "WARNING: Failed to remove lvs on $vg." 
+ pvs=$(exec_no_fds "$STOR_DEV_FDS" "pvs --select \"vg_name=$vg\" --noheadings -o pv_name") + wlog "VG $vg has PVs: $(echo $pvs), removing them." + for pv in $pvs; do + wlog "Removing PV $pv." + exec_no_fds "$STOR_DEV_FDS" "pvremove --force --force --yes $pv" 5 0.5 + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Failed to remove PV." + done + # VG should no longer be present + vg_check=$(exec_no_fds "$STOR_DEV_FDS" "vgs --select \"vg_name=$vg\" --noheadings -o vg_name") + if [ -n "$vg_check" ]; then + wlog "WARNING: VG $vg is still present after removing PVs! Removing it by force." + exec_no_fds "$STOR_DEV_FDS" "vgremove --force $vg" 5 0.5 + [ $? -ne 0 ] && report_pre_failure_with_msg "ERROR: Failed to remove VG." + fi done ONLYUSE_HDD="" part_type_guid_str="Partition GUID code" if [ "$(curl -sf http://pxecontroller:6385/v1/upgrade/$(hostname)/in_upgrade 2>/dev/null)" = "true" ]; then # In an upgrade, only wipe the disk with the rootfs and boot partition - echo "In upgrade, wiping only $rootfs_device" - WIPE_HDD="$(get_disk $rootfs_device)" - ONLYUSE_HDD="$(basename $(get_disk $rootfs_device))" - if [ "$(get_disk $rootfs_device)" != "$(get_disk $boot_device)" ]; then - WIPE_HDD="$WIPE_HDD,$(get_disk $boot_device)" - ONLYUSE_HDD="$ONLYUSE_HDD,$(basename $(get_disk $boot_device))" + wlog "In upgrade, wiping only $rootfs_device" + WIPE_HDD=$rootfs_device + ONLYUSE_HDD="$(basename $rootfs_device)" + if [ "$rootfs_device" != "$boot_device" ]; then + WIPE_HDD="$WIPE_HDD,$boot_device" + ONLYUSE_HDD="$ONLYUSE_HDD,$(basename $boot_device)" fi else # Make a list of all the hard drives that are to be wiped @@ -75,22 +142,15 @@ else # Check if we wipe OSDs if [ "$(curl -sf http://pxecontroller:6385/v1/ihosts/wipe_osds 2>/dev/null)" = "true" ]; then - echo "Wipe OSD data." + wlog "Wipe OSD data." WIPE_CEPH_OSDS="true" else - echo "Skip Ceph OSD data wipe." + wlog "Skip Ceph OSD data wipe." 
WIPE_CEPH_OSDS="false" fi - for f in /dev/disk/by-path/* + for dev in $STOR_DEVS do - dev=$(readlink -f $f) - lsblk --nodeps --pairs $dev | grep -q 'TYPE="disk"' - if [ $? -ne 0 ] - then - continue - fi - # Avoid wiping USB drives udevadm info --query=property --name=$dev |grep -q '^ID_BUS=usb' && continue @@ -101,10 +161,10 @@ else # Scanning the partitions looking for CEPH OSDs and # skipping any disk found with such partitions for part_number in "${part_numbers[@]}"; do - sgdisk_part_info=$(flock $dev sgdisk -i $part_number $dev) + sgdisk_part_info=$(sgdisk -i $part_number $dev) part_type_guid=$(echo "$sgdisk_part_info" | grep "$part_type_guid_str" | awk '{print $4;}') if [ "$part_type_guid" == $CEPH_OSD_GUID ]; then - echo "OSD found on $dev, skipping wipe" + wlog "OSD found on $dev, skipping wipe" wipe_dev="false" break fi @@ -124,14 +184,13 @@ else fi fi done - echo "Not in upgrade, wiping disks: $WIPE_HDD" + wlog "Not in upgrade, wiping disks: $WIPE_HDD" fi -ROOTFS_DISK=$(get_disk $rootfs_device) -ROOTFS_PART_PREFIX=$ROOTFS_DISK +ROOTFS_PART_PREFIX=$rootfs_device #check if disk is nvme -case $ROOTFS_DISK in +case $rootfs_device in *"nvme"*) ROOTFS_PART_PREFIX=${ROOTFS_PART_PREFIX}p ;; @@ -159,28 +218,27 @@ do part=${dev}p${part_number} ;; esac - if [ "$dev" == "$ROOTFS_DISK" ]; then - sgdisk_part_info=$(flock $dev sgdisk -i $part_number $dev) + if [ "$dev" == "$rootfs_device" ]; then + sgdisk_part_info=$(sgdisk -i $part_number $dev) part_type_guid=$(echo "$sgdisk_part_info" | grep "$part_type_guid_str" | awk '{print $4;}') - part_fstype=$(blkid -s TYPE -o value $part) + part_fstype=$(exec_retry 5 0.5 "blkid -s TYPE -o value $part") if [ "$part_type_guid" == $BACKUP_PART_GUID -a "${part_fstype}" == "ext4" ]; then - echo "Skipping wipe backup partition $part" + wlog "Skipping wipe backup partition $part" BACKUP_CREATED=1 continue fi fi - echo "Wiping partition $part" + wlog "Wiping partition $part" dd if=/dev/zero of=$part bs=512 count=34 dd if=/dev/zero 
of=$part bs=512 count=34 seek=$((`blockdev --getsz $part` - 34)) - parted -s $dev rm $part_number + exec_retry 5 0.5 "parted -s $dev rm $part_number" # LP 1876374: On some nvme systems udev doesn't correctly remove the # links to the deleted partitions from /dev/nvme* causing them to be # seen as non block devices. - sleep 0.3 # Wait for udev to settle - rm -f $part # Delete remaining /dev node leftover + exec_retry 5 0.3 "rm -f $part" # Delete remaining /dev node leftover done - if [ $BACKUP_CREATED -eq 0 -o "$dev" != "$ROOTFS_DISK" ]; then - echo "Creating disk label for $dev" + if [ $BACKUP_CREATED -eq 0 -o "$dev" != "$rootfs_device" ]; then + wlog "Creating disk label for $dev" parted -s $dev mktable gpt fi @@ -190,10 +248,10 @@ done # in an upgrade where we're not wiping all disks. # If we ever create other volume groups from kickstart in the future, # include them in this search as well. -partitions=$(pvs --select 'vg_name=cgts-vg' -o pv_name --noheading | grep -v '\[unknown\]') +partitions=$(exec_no_fds "$STOR_DEV_FDS" "pvs --select 'vg_name=cgts-vg' -o pv_name --noheading" | grep -v '\[unknown\]') for p in $partitions do - echo "Pre-wiping $p from kickstart (cgts-vg present)" + wlog "Pre-wiping $p from kickstart (cgts-vg present)" dd if=/dev/zero of=$p bs=512 count=34 dd if=/dev/zero of=$p bs=512 count=34 seek=$((`blockdev --getsz $p` - 34)) done diff --git a/bsp-files/kickstarts/pre_disk_setup_tail.cfg b/bsp-files/kickstarts/pre_disk_setup_tail.cfg new file mode 100644 index 00000000..f11cdd37 --- /dev/null +++ b/bsp-files/kickstarts/pre_disk_setup_tail.cfg @@ -0,0 +1,21 @@ + +# Log info about system state at end of partitioning operation. +for dev in $STOR_DEVS; do + wlog "Partition table at end of script for $dev is:" + parted -s $dev unit mib print +done + +# Close all FDs and wait for udev to reshuffle all partitions. +wlog "Releasing storage device locks and FDs." 
+for fd in $STOR_DEV_FDS +do + flock -u "$fd" + exec {fd}>&- +done +sleep 2 +udevadm settle || report_pre_failure_with_msg "ERROR: udevadm settle failed!" + +# Rescan LVM cache to avoid warnings for VGs that were recreated. +pvscan --cache + +%end diff --git a/bsp-files/kickstarts/pre_disk_storage.cfg b/bsp-files/kickstarts/pre_disk_storage.cfg index d3bf93d2..6e89db1c 100755 --- a/bsp-files/kickstarts/pre_disk_storage.cfg +++ b/bsp-files/kickstarts/pre_disk_storage.cfg @@ -1,5 +1,5 @@ -sz=$(blockdev --getsize64 $(get_disk $rootfs_device)) +sz=$(blockdev --getsize64 $rootfs_device) if [ $sz -le $((90*$gb)) ] ; then LOG_VOL_SIZE=4000 SCRATCH_VOL_SIZE=4000 @@ -23,23 +23,21 @@ EOF if [ -d /sys/firmware/efi ] ; then cat<<EOF>>/tmp/part-include -part /boot/efi --fstype=efi --size=300 --ondrive=$(get_disk $boot_device) +part /boot/efi --fstype=efi --size=300 --ondrive=$boot_device EOF else cat<<EOF>>/tmp/part-include -part biosboot --asprimary --fstype=biosboot --size=1 --ondrive=$(get_disk $boot_device) +part biosboot --asprimary --fstype=biosboot --size=1 --ondrive=$boot_device EOF fi cat<<EOF>>/tmp/part-include -part /boot --fstype=ext4 --asprimary --size=500 --ondrive=$(get_disk $rootfs_device) --fsoptions="$ROOTFS_OPTIONS" -part pv.253004 --grow --asprimary --size=500 --ondrive=$(get_disk $rootfs_device) +part /boot --fstype=ext4 --asprimary --size=500 --ondrive=$rootfs_device --fsoptions="$ROOTFS_OPTIONS" +part pv.253004 --grow --asprimary --size=500 --ondrive=$rootfs_device volgroup cgts-vg --pesize=32768 pv.253004 logvol /var/log --fstype=ext4 --vgname=cgts-vg --size=$LOG_VOL_SIZE --name=log-lv logvol /scratch --fstype=ext4 --vgname=cgts-vg --size=$SCRATCH_VOL_SIZE --name=scratch-lv -part / --fstype=ext4 --asprimary --size=$ROOTFS_SIZE --ondrive=$(get_disk $rootfs_device) --fsoptions="$ROOTFS_OPTIONS" +part / --fstype=ext4 --asprimary --size=$ROOTFS_SIZE --ondrive=$rootfs_device --fsoptions="$ROOTFS_OPTIONS" EOF -%end - diff --git a/bsp-files/kickstarts/pre_disk_worker.cfg 
b/bsp-files/kickstarts/pre_disk_worker.cfg index 5a408f5a..d051b530 100755 --- a/bsp-files/kickstarts/pre_disk_worker.cfg +++ b/bsp-files/kickstarts/pre_disk_worker.cfg @@ -15,7 +15,7 @@ BOOT_VOL_SIZE=500 ## 69648/1024=68.01. CGTS_PV_SIZE=69*1024=70656. CGTS_PV_SIZE=70656 -sz=$(blockdev --getsize64 $(get_disk $rootfs_device)) +sz=$(blockdev --getsize64 $rootfs_device) if [ $sz -le $((80*$gb)) ] ; then ## Less than 80GB use a 10GB root partition ROOTFS_SIZE=10000 @@ -37,23 +37,21 @@ EOF if [ -d /sys/firmware/efi ] ; then cat<<EOF>>/tmp/part-include -part /boot/efi --fstype=efi --size=300 --ondrive=$(get_disk $boot_device) +part /boot/efi --fstype=efi --size=300 --ondrive=$boot_device EOF else cat<<EOF>>/tmp/part-include -part biosboot --asprimary --fstype=biosboot --size=1 --ondrive=$(get_disk $boot_device) +part biosboot --asprimary --fstype=biosboot --size=1 --ondrive=$boot_device EOF fi cat<<EOF>>/tmp/part-include -part /boot --fstype=ext4 --asprimary --size=$BOOT_VOL_SIZE --ondrive=$(get_disk $rootfs_device) --fsoptions="$ROOTFS_OPTIONS" -part pv.253004 --asprimary --size=$CGTS_PV_SIZE --ondrive=$(get_disk $rootfs_device) +part /boot --fstype=ext4 --asprimary --size=$BOOT_VOL_SIZE --ondrive=$rootfs_device --fsoptions="$ROOTFS_OPTIONS" +part pv.253004 --asprimary --size=$CGTS_PV_SIZE --ondrive=$rootfs_device volgroup cgts-vg --pesize=32768 pv.253004 logvol /var/log --fstype=ext4 --vgname=cgts-vg --size=$LOG_VOL_SIZE --name=log-lv logvol /scratch --fstype=ext4 --vgname=cgts-vg --size=$SCRATCH_VOL_SIZE --name=scratch-lv -part / --fstype=ext4 --asprimary --size=$ROOTFS_SIZE --ondrive=$(get_disk $rootfs_device) --fsoptions="$ROOTFS_OPTIONS" +part / --fstype=ext4 --asprimary --size=$ROOTFS_SIZE --ondrive=$rootfs_device --fsoptions="$ROOTFS_OPTIONS" EOF -%end -