9a3fb29816
We use glean in the gate tests to enable the network. Because we are in a DHCP environment with the devstack environment, it's possible that glean didn't start but we still manage to get an address anyway via autoconfiguration. But the point of having glean is that in clouds without DHCP things work too. This is most common with new distributions, where glean matching may not have updated to configure things fully. We could replicate a non-DHCP environment, at some considerable initial cost of figuring out how to setup devstack to do that and then ongoing costs of having a fairly bespoke environment. A simpler path is to just check the glean services and make sure they started correctly. Change-Id: Idb8d67cd2ba6e8c5d7157177e9cfd6be7b99cacd
179 lines
6.3 KiB
Bash
Executable File
179 lines
6.3 KiB
Bash
Executable File
#!/bin/bash -ex

# Re-assert the shebang options so they still apply when this script is
# invoked as "bash <script>" (shebang arguments are lost in that case).
set -ex

# Directory where job artifacts are collected for later inspection.
LOGDIR=/home/zuul/zuul-output/logs

# Set to indicate an error return
RETURN=0
FAILURE_REASON=""

# Work out how to invoke the nodepool CLI for this environment.
if [[ ${NODEPOOL_FUNCTIONAL_CHECK:-} == "installed" ]]; then
    NODEPOOL_INSTALL=${NODEPOOL_INSTALL:-~/.venv}
    NODEPOOL_CONFIG=${NODEPOOL_CONFIG:-/etc/nodepool/nodepool.yaml}
    NODEPOOL="$NODEPOOL_INSTALL/bin/nodepool -c $NODEPOOL_CONFIG"
elif [[ ${NODEPOOL_FUNCTIONAL_CHECK:-} == "containers" ]]; then
    NODEPOOL="docker exec nodepool_nodepool-launcher_1 nodepool"
else
    echo "Running in unknown environment!"
    exit 1
fi

# Helper that ssh's into a built node as root with the nodepool test key.
# NOTE: $HOME is expanded now, at write time; the escaped \$@ is expanded
# at run time, so "/tmp/ssh_wrapper <host> <cmd...>" becomes
# "ssh ... root@<host> <cmd...>".
cat > /tmp/ssh_wrapper <<EOF
#!/bin/bash -ex
sudo -H -u zuul ssh -o StrictHostKeyChecking=no -i $HOME/.ssh/id_nodepool root@\$@

EOF
sudo chmod 0755 /tmp/ssh_wrapper
|
|
|
|
# sshintonode <image-name>
#
# ssh into the ready node built from <image-name> and run a series of
# sanity checks against it.  Failures set FAILURE_REASON and RETURN=1
# rather than aborting, so all checks run and the script can report at
# the end.
function sshintonode {
    name=$1
    state='ready'

    # Extract the node's IP address (column 6) from the nodepool listing.
    node=$($NODEPOOL list | grep $name | grep $state | cut -d '|' -f6 | tr -d ' ')

    # Smoke-test that ssh works at all (aborts the script if not).
    /tmp/ssh_wrapper $node ls /

    # Check that the root partition grew on boot; it should be a 5GiB
    # partition minus some space for the boot partition.  However
    # empirical evidence suggests there is some modulo maths going on,
    # (possibly with alignment?) that means we can vary up to even
    # 64MiB.  Thus we choose an expected value that gives us enough
    # slop to avoid false matches, but still indicates we resized up.
    root_size=$(/tmp/ssh_wrapper $node -- lsblk -rbno SIZE /dev/vda1)
    expected_root_size=$(( 5000000000 ))
    if [[ $root_size -lt $expected_root_size ]]; then
        echo "*** Root device does not appear to have grown: $root_size"
        FAILURE_REASON="Root partition of $name does not appear to have grown: $root_size < $expected_root_size"
        RETURN=1
    fi

    # Check we saw metadata deployed to the config-drive.
    # NOTE: the command runs inside "if !" because this script executes
    # under "set -e"; the previous "run it, then test $?" pattern could
    # never reach its failure branch (a non-zero ssh exit aborted the
    # whole script first).
    if ! /tmp/ssh_wrapper $node \
            "dd status=none if=/dev/sr0 | tr -cd '[:print:]' | grep -q nodepool_devstack"; then
        echo "*** Failed to find metadata in config-drive"
        FAILURE_REASON="Failed to find meta-data in config-drive for $node"
        RETURN=1
    fi

    # Debugging that we're seeing the right node
    /tmp/ssh_wrapper $node -- "cat /etc/os-release; blkid"

    # This ensures DIB setup the bootloader kernel arguments correctly
    # by looking for the default console setup it does.  In the past
    # we have seen issues with bootloader installation for all sorts
    # of reasons (our errors and upstream changes); but generally what
    # has happened is that case grub has silently fallen-back to
    # "guessing" from the running build-system what kernel flags to
    # use.
    #
    # DIB images should be booting from a root device labeled
    # "cloudimg-rootfs".  In the gate, where you're *using* a
    # DIB-image to build a DIB-image, it's /proc/cmdline contains
    # "root=LABEL=cloudimg-rootfs" and a misconfigured grub can
    # actually guess the correct root device.  However, when this
    # builds an image in production on a totally different host you
    # get a non-booting, wrong image.  Ergo, although this is mostly
    # what we're interested in validating, this is not a reliable
    # thing to test directly.
    #
    # So below, we probe for something else; the console setup that
    # DIB will put in.  If this is missing, it's an indication that
    # the bootloader is not setting up the kernel arguments correctly.
    kernel_cmd_line=$(/tmp/ssh_wrapper $node -- cat /proc/cmdline)
    echo "Kernel command line: ${kernel_cmd_line}"
    if [[ ! $kernel_cmd_line =~ 'console=tty0 console=ttyS0,115200' ]]; then
        echo "*** Failed to find correct kernel boot flags"
        FAILURE_REASON="Failed to find correct kernel boot flags $node"
        RETURN=1
    fi

    # Ensure glean services have loaded
    # The output is like:
    # ---
    #  glean-early.service  loaded active exited Early glean execution
    #  glean@ens3.service   loaded active exited Glean for interface ens3 with NetworkManager
    #  glean@lo.service     loaded active exited Glean for interface lo with NetworkManager
    # ---
    # So if we see anything other than 'loaded active exited' we have a
    # problem.  The inner "|| true" keeps grep -v's empty-result exit
    # status from tripping "set -e".
    glean_status=$(/tmp/ssh_wrapper $node -- "systemctl | egrep 'glean[@|-]' | { grep -v 'loaded active exited' || true; }")
    if [[ ${glean_status} != '' ]]; then
        echo "*** Glean not loaded correctly"
        echo "*** saw: ${glean_status}"
        FAILURE_REASON="Failed to start glean correctly"
        RETURN=1
    fi
}
|
|
|
|
# checknm <image-name>
#
# Verify that NetworkManager on the built node has a connection for the
# primary interface.  Records a failure via FAILURE_REASON/RETURN
# instead of aborting.
function checknm {
    name=$1
    state='ready'

    # Resolve the node's IP address (column 6) from the nodepool listing.
    node=$($NODEPOOL list | grep $name | grep $state | cut -d '|' -f6 | tr -d ' ')
    nm_output=$(/tmp/ssh_wrapper $node -- nmcli c)

    # virtio device is eth0 on older, ens3 on newer
    case ${nm_output} in
        *eth0*|*ens3*)
            # Expected interface present; nothing further to do.
            ;;
        *)
            echo "*** Failed to find interface in NetworkManager connections"
            # Dump connection and device state to aid debugging.
            /tmp/ssh_wrapper $node -- nmcli c
            /tmp/ssh_wrapper $node -- nmcli device
            FAILURE_REASON="Failed to find interface in NetworkManager connections"
            RETURN=1
            ;;
    esac
}
|
|
|
|
# waitforimage <image-name>
#
# Poll until nodepool reports a ready image named <image-name>, dumping
# current state to $LOGDIR on each iteration.  Aborts the whole script
# if the build appears to be failing repeatedly.
function waitforimage {
    local name=$1
    local state='ready'
    local builds

    while ! $NODEPOOL image-list | grep $name | grep $state; do
        # Snapshot current state for post-mortem debugging.
        $NODEPOOL image-list > ${LOGDIR}/nodepool-image-list.txt
        $NODEPOOL list --detail > ${LOGDIR}/nodepool-list.txt

        # Count build artifacts for this image.  "grep -c" replaces the
        # old "ls -l | grep | wc -l" pipeline; the "|| true" preserves
        # the "0" output when nothing matches (grep -c exits non-zero on
        # zero matches, which would otherwise trip "set -e").
        # NOTE(review): threshold of 4 entries is assumed to mean ~3
        # failed attempts, matching the original logic — confirm against
        # what the builder writes per attempt.
        builds=$(ls /var/log/nodepool/builds/ | grep -c -- $name || true)
        if [[ ${builds} -ge 4 ]]; then
            echo "*** Build of $name failed at least 3 times, aborting"
            exit 1
        fi
        sleep 10
    done
}
|
|
|
|
# waitfornode <image-name>
#
# Poll until a node built from <image-name> shows up as ready and
# unlocked in the nodepool listing.
function waitfornode {
    name=$1
    state='ready'

    until $NODEPOOL list | grep $name | grep $state | grep "unlocked"; do
        # Snapshot current state into the log dir for debugging.
        $NODEPOOL list --detail > ${LOGDIR}/nodepool-list.txt
        $NODEPOOL image-list > ${LOGDIR}/nodepool-image-list.txt
        sleep 10
    done
}
|
|
|
|
# Wait for the image build to complete...
waitforimage test-image
# ...and for a node booted from it to come up.
waitfornode test-image
# Run the in-node sanity checks over ssh as root.
sshintonode test-image
# networkmanager check
# TODO(jeblair): This should not run in all cases; in fact, most of
# this checking should move into the dib repo
#checknm test-image
# userdata check

set -o errexit
# Show the built nodes
$NODEPOOL list

# Try to delete the nodes that were just built
$NODEPOOL delete --now 0000000000

# show the deleted nodes (and their replacements may be building)
$NODEPOOL list

# Surface the first recorded failure, if any, before exiting.
if [[ "${FAILURE_REASON}" != "" ]]; then
    echo "${FAILURE_REASON}"
fi
exit $RETURN