Add a timeout for the image build
A builder thread can wedge if the build process wedges. Add a timeout to the subprocess. Since it was the call to readline() that would block, we change the process to have DIB write directly to the log. This allows us to set a timeout in the Popen.wait() call. And we kill the dib subprocess, as well. The timeout value can be controlled in the diskimage configuration and defaults to 8 hours. Change-Id: I188e8a74dc39b55a4b50ade5c1a96832fea76a7d
This commit is contained in:
parent
7fd6411dac
commit
7225354ec0
@ -277,6 +277,12 @@ Options
|
||||
Specifies the distro to be used as a base image to build the image using
|
||||
diskimage-builder.
|
||||
|
||||
.. attr:: build-timeout
|
||||
:type: int
|
||||
|
||||
How long (in seconds) to wait for the diskimage build before giving up.
|
||||
The default is 8 hours.
|
||||
|
||||
.. attr:: elements
|
||||
:type: list
|
||||
|
||||
|
@ -739,21 +739,28 @@ class BuildWorker(BaseWorker):
|
||||
if 'qcow2' in img_types:
|
||||
qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS
|
||||
|
||||
cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s %s' %
|
||||
log_fn = self._getBuildLog(diskimage.name, build_id)
|
||||
|
||||
cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s --logfile %s %s' %
|
||||
(self.dib_cmd, img_types, qemu_img_options, filename,
|
||||
img_elements))
|
||||
log_fn, img_elements))
|
||||
|
||||
self._pruneBuildLogs(diskimage.name)
|
||||
log_fn = self._getBuildLog(diskimage.name, build_id)
|
||||
|
||||
self.log.info('Running %s' % (cmd,))
|
||||
self.log.info('Logging to %s' % (log_fn,))
|
||||
|
||||
start_time = time.monotonic()
|
||||
|
||||
# We used to use readline() on stdout to output the lines to the
|
||||
# build log. Unfortunately, this would block as long as the process
|
||||
# ran (with no easy way to timeout the read) and wedge the builder.
|
||||
# Now we use --logfile option to the dib command and set a timeout
|
||||
# on the wait() call to prevent the wedge.
|
||||
did_timeout = False
|
||||
try:
|
||||
p = subprocess.Popen(
|
||||
shlex.split(cmd),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
env=env)
|
||||
except OSError as e:
|
||||
@ -761,17 +768,20 @@ class BuildWorker(BaseWorker):
|
||||
"Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
|
||||
)
|
||||
|
||||
with open(log_fn, 'wb') as log:
|
||||
while True:
|
||||
ln = p.stdout.readline()
|
||||
log.write(ln)
|
||||
log.flush()
|
||||
if not ln:
|
||||
break
|
||||
|
||||
rc = p.wait()
|
||||
m = "Exit code: %s\n" % rc
|
||||
log.write(m.encode('utf8'))
|
||||
try:
|
||||
rc = p.wait(timeout=diskimage.build_timeout)
|
||||
except subprocess.TimeoutExpired:
|
||||
p.kill()
|
||||
did_timeout = True
|
||||
rc = 1
|
||||
self.log.error(
|
||||
"Build timeout for image %s, build %s (log: %s)",
|
||||
diskimage.name, build_id, log_fn)
|
||||
else:
|
||||
# Append return code to dib's log file
|
||||
with open(log_fn, 'ab') as log:
|
||||
m = "Exit code: %s\n" % rc
|
||||
log.write(m.encode('utf8'))
|
||||
|
||||
# It's possible the connection to the ZK cluster could have been
|
||||
# interrupted during the build. If so, wait for it to return.
|
||||
@ -796,9 +806,10 @@ class BuildWorker(BaseWorker):
|
||||
self.log.info("ZooKeeper lost while building %s" % diskimage.name)
|
||||
self._zk.resetLostFlag()
|
||||
build_data.state = zk.FAILED
|
||||
elif p.returncode:
|
||||
elif p.returncode or did_timeout:
|
||||
self.log.info(
|
||||
"DIB failed creating %s (%s)" % (diskimage.name, p.returncode))
|
||||
"DIB failed creating %s (%s) (timeout=%s)" % (
|
||||
diskimage.name, p.returncode, did_timeout))
|
||||
build_data.state = zk.FAILED
|
||||
else:
|
||||
self.log.info("DIB image %s is built" % diskimage.name)
|
||||
|
@ -44,6 +44,7 @@ class ConfigValidator:
|
||||
'rebuild-age': int,
|
||||
'env-vars': {str: str},
|
||||
'username': str,
|
||||
'build-timeout': int,
|
||||
}
|
||||
|
||||
webapp = {
|
||||
|
@ -118,6 +118,7 @@ class Config(ConfigValue):
|
||||
d.image_types = set(diskimage.get('formats', []))
|
||||
d.pause = bool(diskimage.get('pause', False))
|
||||
d.username = diskimage.get('username', 'zuul')
|
||||
d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
|
||||
self.diskimages[d.name] = d
|
||||
|
||||
def setSecureDiskimageEnv(self, diskimages, secure_config_path):
|
||||
@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
|
||||
self.image_types = None
|
||||
self.pause = False
|
||||
self.username = None
|
||||
self.build_timeout = None
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, DiskImage):
|
||||
@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
|
||||
other.env_vars == self.env_vars and
|
||||
other.image_types == self.image_types and
|
||||
other.pause == self.pause and
|
||||
other.username == self.username)
|
||||
other.username == self.username and
|
||||
other.build_timeout == self.build_timeout)
|
||||
return False
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -410,20 +410,25 @@ class DBTestCase(BaseTestCase):
|
||||
time.sleep(1)
|
||||
self.wait_for_threads()
|
||||
|
||||
def waitForBuild(self, image_name, build_id):
|
||||
def waitForBuild(self, image_name, build_id, states=None):
|
||||
if states is None:
|
||||
states = (zk.READY,)
|
||||
|
||||
base = "-".join([image_name, build_id])
|
||||
while True:
|
||||
self.wait_for_threads()
|
||||
files = builder.DibImageFile.from_image_id(
|
||||
self._config_images_dir.path, base)
|
||||
if files:
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
while True:
|
||||
self.wait_for_threads()
|
||||
build = self.zk.getBuild(image_name, build_id)
|
||||
if build and build.state == zk.READY:
|
||||
if build and build.state in states:
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
# We should only expect a dib manifest with a successful build.
|
||||
while build.state == zk.READY:
|
||||
self.wait_for_threads()
|
||||
files = builder.DibImageFile.from_image_id(
|
||||
self._config_images_dir.path, base)
|
||||
if files:
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
|
@ -1,10 +1,49 @@
|
||||
#!/bin/bash
|
||||
|
||||
outfile=
|
||||
outtypes=("qcow2")
|
||||
|
||||
all_args=$*
|
||||
logfile=
|
||||
checksum=
|
||||
no_tmpfs=
|
||||
qemu_img_options=
|
||||
x=
|
||||
|
||||
TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum,logfile: -- "$@")
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Invalid option"
|
||||
exit 1
|
||||
fi
|
||||
eval set -- "$TEMP"
|
||||
while true ; do
|
||||
case "$1" in
|
||||
--checksum)
|
||||
checksum=1; shift 1;;
|
||||
--no-tmpfs)
|
||||
no_tmpfs=1; shift 1;;
|
||||
--qemu-img-options)
|
||||
qemu_img_options=$2; shift 2;;
|
||||
--logfile)
|
||||
logfile=$2; shift 2;;
|
||||
-o) outfile=$2; shift 2;;
|
||||
-t) IFS="," read -a outtypes <<< "$2"; shift 2;;
|
||||
-x) x=1; shift;;
|
||||
--) shift ; break ;;
|
||||
*) echo "Unknown option : $1"; exit 1;;
|
||||
esac
|
||||
done
|
||||
|
||||
# If --logfile was given, direct stdout to it, as well
|
||||
if [ ! -z "$logfile" ]; then
|
||||
exec > >(tee -a ${logfile})
|
||||
fi
|
||||
|
||||
echo "*** fake-image-create: start"
|
||||
|
||||
echo "arguments:"
|
||||
echo "----"
|
||||
echo $*
|
||||
echo "$all_args"
|
||||
echo "----"
|
||||
|
||||
if [[ "${SHOULD_FAIL}" == 'true' ]]; then
|
||||
@ -38,30 +77,21 @@ if [[ "${BASE_IMAGE_FILE}" != "Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
outfile=
|
||||
outtypes=("qcow2")
|
||||
|
||||
TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum -- "$@")
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Invalid option"
|
||||
exit 1
|
||||
if [ ! -z "$logfile" ]; then
|
||||
echo " -> logfile: $logfile"
|
||||
fi
|
||||
if [ ! -z "$checksum" ]; then
|
||||
echo " -> set --checksum"
|
||||
fi
|
||||
if [ ! -z "$no_tmpfs" ]; then
|
||||
echo " -> set --no-tmpfs"
|
||||
fi
|
||||
if [ ! -z "$qemu_img_options" ]; then
|
||||
echo " -> qemu-img-options: $qemu_img_options"
|
||||
fi
|
||||
if [ ! -z "$x" ]; then
|
||||
echo " -> debugging enabled"
|
||||
fi
|
||||
eval set -- "$TEMP"
|
||||
while true ; do
|
||||
case "$1" in
|
||||
--checksum)
|
||||
echo " -> set --checksum"; shift 1;;
|
||||
--no-tmpfs)
|
||||
echo " -> set --no-tmpfs"; shift 1;;
|
||||
--qemu-img-options)
|
||||
echo " -> qemu-img-options: $2"; shift 2;;
|
||||
-o) outfile=$2; shift 2;;
|
||||
-t) IFS="," read -a outtypes <<< "$2"; shift 2;;
|
||||
-x) echo " -> debugging enabled"; shift;;
|
||||
--) shift ; break ;;
|
||||
*) echo "Unknown option : $1"; exit 1;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ -z "$outfile" ]; then
|
||||
echo "No output file specified."
|
||||
|
@ -130,6 +130,7 @@ diskimages:
|
||||
- cache-devstack
|
||||
release: trusty
|
||||
rebuild-age: 3600
|
||||
build-timeout: 3600
|
||||
env-vars:
|
||||
TMPDIR: /opt/dib_tmp
|
||||
DIB_IMAGE_CACHE: /opt/dib_cache
|
||||
|
26
nodepool/tests/fixtures/diskimage_build_timeout.yaml
vendored
Normal file
26
nodepool/tests/fixtures/diskimage_build_timeout.yaml
vendored
Normal file
@ -0,0 +1,26 @@
|
||||
elements-dir: .
|
||||
images-dir: '{images_dir}'
|
||||
build-log-dir: '{build_log_dir}'
|
||||
|
||||
zookeeper-servers:
|
||||
- host: {zookeeper_host}
|
||||
port: {zookeeper_port}
|
||||
chroot: {zookeeper_chroot}
|
||||
|
||||
labels: []
|
||||
|
||||
providers: []
|
||||
|
||||
diskimages:
|
||||
- name: fake-image
|
||||
formats:
|
||||
- tar
|
||||
elements:
|
||||
- fedora
|
||||
- vm
|
||||
release: 21
|
||||
env-vars:
|
||||
TMPDIR: /opt/dib_tmp
|
||||
DIB_IMAGE_CACHE: /opt/dib_cache
|
||||
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
|
||||
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
|
@ -16,6 +16,8 @@
|
||||
import os
|
||||
import uuid
|
||||
import fixtures
|
||||
import mock
|
||||
import subprocess
|
||||
|
||||
from nodepool import builder, exceptions, tests
|
||||
from nodepool.driver.fake import provider as fakeprovider
|
||||
@ -335,3 +337,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
|
||||
|
||||
self.assertEqual(build_default._formats, ['qcow2'])
|
||||
self.assertEqual(build_vhd._formats, ['vhd'])
|
||||
|
||||
@mock.patch.object(subprocess.Popen, 'wait')
|
||||
def test_diskimage_build_timeout(self, mock_wait):
|
||||
mock_wait.side_effect = subprocess.TimeoutExpired('dib_cmd', 1)
|
||||
configfile = self.setup_config('diskimage_build_timeout.yaml')
|
||||
self.useBuilder(configfile, cleanup_interval=0)
|
||||
self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))
|
||||
|
6
releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
Normal file
6
releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
A new option (build-timeout) has been added to the builder diskimage
|
||||
configuration to control how long the builder should wait for image
|
||||
builds before giving up. The default is 8 hours.
|
Loading…
x
Reference in New Issue
Block a user