Revert "Revert "Add a timeout for the image build""
This reverts commit ccf40a462a
.
The previous version would not work properly when daemonized
because there was no stdout. This version maintains stdout and
uses select/poll with non-blocking stdout to capture the output
to a log file.
Depends-On: https://review.openstack.org/634266
Change-Id: I7f0617b91e071294fe6051d14475ead1d7df56b7
This commit is contained in:
parent
acf038aef4
commit
890ea4975e
@ -277,6 +277,12 @@ Options
|
|||||||
Specifies the distro to be used as a base image to build the image using
|
Specifies the distro to be used as a base image to build the image using
|
||||||
diskimage-builder.
|
diskimage-builder.
|
||||||
|
|
||||||
|
.. attr:: build-timeout
|
||||||
|
:type: int
|
||||||
|
|
||||||
|
How long (in seconds) to wait for the diskimage build before giving up.
|
||||||
|
The default is 8 hours.
|
||||||
|
|
||||||
.. attr:: elements
|
.. attr:: elements
|
||||||
:type: list
|
:type: list
|
||||||
|
|
||||||
|
@ -13,8 +13,10 @@
|
|||||||
# License for the specific language governing permissions and limitations
|
# License for the specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
|
import fcntl
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import select
|
||||||
import shutil
|
import shutil
|
||||||
import socket
|
import socket
|
||||||
import subprocess
|
import subprocess
|
||||||
@ -42,6 +44,9 @@ SUSPEND_WAIT_TIME = 30
|
|||||||
# so just hardcode it for all qcow2 building
|
# so just hardcode it for all qcow2 building
|
||||||
DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'"
|
DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'"
|
||||||
|
|
||||||
|
# DIB process polling timeout, in milliseconds
|
||||||
|
BUILD_PROCESS_POLL_TIMEOUT = 30 * 1000
|
||||||
|
|
||||||
|
|
||||||
class DibImageFile(object):
|
class DibImageFile(object):
|
||||||
'''
|
'''
|
||||||
@ -750,6 +755,7 @@ class BuildWorker(BaseWorker):
|
|||||||
self.log.info('Logging to %s' % (log_fn,))
|
self.log.info('Logging to %s' % (log_fn,))
|
||||||
|
|
||||||
start_time = time.monotonic()
|
start_time = time.monotonic()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
p = subprocess.Popen(
|
p = subprocess.Popen(
|
||||||
shlex.split(cmd),
|
shlex.split(cmd),
|
||||||
@ -761,15 +767,71 @@ class BuildWorker(BaseWorker):
|
|||||||
"Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
|
"Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Make subprocess stdout non-blocking
|
||||||
|
fd = p.stdout.fileno()
|
||||||
|
fl = fcntl.fcntl(fd, fcntl.F_GETFL)
|
||||||
|
fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
|
||||||
|
|
||||||
|
# Poll subprocess stdout for readability
|
||||||
|
r, w, e = select.select([fd], [], [], 0)
|
||||||
|
bitmask = (select.POLLIN | select.POLLHUP)
|
||||||
|
poll = select.poll()
|
||||||
|
poll.register(fd, bitmask)
|
||||||
|
|
||||||
|
rc = None
|
||||||
|
did_timeout = False
|
||||||
|
subprocess_done = False
|
||||||
|
|
||||||
|
def buildDidTimeout():
|
||||||
|
build_time = time.monotonic() - start_time
|
||||||
|
if build_time > diskimage.build_timeout:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
with open(log_fn, 'wb') as log:
|
with open(log_fn, 'wb') as log:
|
||||||
while True:
|
|
||||||
ln = p.stdout.readline()
|
# While the subprocess is running, we will loop through stdout
|
||||||
log.write(ln)
|
# events. If we can read data, write that out to the log file.
|
||||||
|
# If the subprocess completes, set the flag so we can exit and
|
||||||
|
# write the return code.
|
||||||
|
#
|
||||||
|
# We check for build timeouts in two places: after we have read
|
||||||
|
# all available data in the buffer (which will cause an early exit
|
||||||
|
# of the poll loop), and after a poll timeout. If we did not have
|
||||||
|
# a check after the reads, we would have to have a poll timeout
|
||||||
|
# to occur to check for a build timeout, which may not happen if
|
||||||
|
# there is constantly data available for reading.
|
||||||
|
|
||||||
|
while not subprocess_done:
|
||||||
|
for fd, event in poll.poll(BUILD_PROCESS_POLL_TIMEOUT):
|
||||||
|
|
||||||
|
# Data available for reading
|
||||||
|
if event & select.POLLIN:
|
||||||
|
data = p.stdout.read(1024)
|
||||||
|
while data:
|
||||||
|
log.write(data)
|
||||||
log.flush()
|
log.flush()
|
||||||
if not ln:
|
data = p.stdout.read(1024)
|
||||||
|
if buildDidTimeout():
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Subprocess complete
|
||||||
|
elif event & select.POLLHUP:
|
||||||
|
subprocess_done = True
|
||||||
rc = p.wait()
|
rc = p.wait()
|
||||||
|
|
||||||
|
if not subprocess_done:
|
||||||
|
if buildDidTimeout():
|
||||||
|
did_timeout = True
|
||||||
|
rc = 1
|
||||||
|
self.log.error(
|
||||||
|
"Build timeout for image %s, build %s (log: %s)",
|
||||||
|
diskimage.name, build_id, log_fn)
|
||||||
|
p.kill()
|
||||||
|
break
|
||||||
|
|
||||||
|
# Subprocess finished, write return code
|
||||||
|
if not did_timeout:
|
||||||
m = "Exit code: %s\n" % rc
|
m = "Exit code: %s\n" % rc
|
||||||
log.write(m.encode('utf8'))
|
log.write(m.encode('utf8'))
|
||||||
|
|
||||||
@ -796,9 +858,10 @@ class BuildWorker(BaseWorker):
|
|||||||
self.log.info("ZooKeeper lost while building %s" % diskimage.name)
|
self.log.info("ZooKeeper lost while building %s" % diskimage.name)
|
||||||
self._zk.resetLostFlag()
|
self._zk.resetLostFlag()
|
||||||
build_data.state = zk.FAILED
|
build_data.state = zk.FAILED
|
||||||
elif p.returncode:
|
elif p.returncode or did_timeout:
|
||||||
self.log.info(
|
self.log.info(
|
||||||
"DIB failed creating %s (%s)" % (diskimage.name, p.returncode))
|
"DIB failed creating %s (%s) (timeout=%s)" % (
|
||||||
|
diskimage.name, p.returncode, did_timeout))
|
||||||
build_data.state = zk.FAILED
|
build_data.state = zk.FAILED
|
||||||
else:
|
else:
|
||||||
self.log.info("DIB image %s is built" % diskimage.name)
|
self.log.info("DIB image %s is built" % diskimage.name)
|
||||||
|
@ -44,6 +44,7 @@ class ConfigValidator:
|
|||||||
'rebuild-age': int,
|
'rebuild-age': int,
|
||||||
'env-vars': {str: str},
|
'env-vars': {str: str},
|
||||||
'username': str,
|
'username': str,
|
||||||
|
'build-timeout': int,
|
||||||
}
|
}
|
||||||
|
|
||||||
webapp = {
|
webapp = {
|
||||||
|
@ -118,6 +118,7 @@ class Config(ConfigValue):
|
|||||||
d.image_types = set(diskimage.get('formats', []))
|
d.image_types = set(diskimage.get('formats', []))
|
||||||
d.pause = bool(diskimage.get('pause', False))
|
d.pause = bool(diskimage.get('pause', False))
|
||||||
d.username = diskimage.get('username', 'zuul')
|
d.username = diskimage.get('username', 'zuul')
|
||||||
|
d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
|
||||||
self.diskimages[d.name] = d
|
self.diskimages[d.name] = d
|
||||||
|
|
||||||
def setSecureDiskimageEnv(self, diskimages, secure_config_path):
|
def setSecureDiskimageEnv(self, diskimages, secure_config_path):
|
||||||
@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
|
|||||||
self.image_types = None
|
self.image_types = None
|
||||||
self.pause = False
|
self.pause = False
|
||||||
self.username = None
|
self.username = None
|
||||||
|
self.build_timeout = None
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
if isinstance(other, DiskImage):
|
if isinstance(other, DiskImage):
|
||||||
@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
|
|||||||
other.env_vars == self.env_vars and
|
other.env_vars == self.env_vars and
|
||||||
other.image_types == self.image_types and
|
other.image_types == self.image_types and
|
||||||
other.pause == self.pause and
|
other.pause == self.pause and
|
||||||
other.username == self.username)
|
other.username == self.username and
|
||||||
|
other.build_timeout == self.build_timeout)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -411,20 +411,25 @@ class DBTestCase(BaseTestCase):
|
|||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
self.wait_for_threads()
|
self.wait_for_threads()
|
||||||
|
|
||||||
def waitForBuild(self, image_name, build_id):
|
def waitForBuild(self, image_name, build_id, states=None):
|
||||||
|
if states is None:
|
||||||
|
states = (zk.READY,)
|
||||||
|
|
||||||
base = "-".join([image_name, build_id])
|
base = "-".join([image_name, build_id])
|
||||||
while True:
|
|
||||||
self.wait_for_threads()
|
|
||||||
files = builder.DibImageFile.from_image_id(
|
|
||||||
self._config_images_dir.path, base)
|
|
||||||
if files:
|
|
||||||
break
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
self.wait_for_threads()
|
self.wait_for_threads()
|
||||||
build = self.zk.getBuild(image_name, build_id)
|
build = self.zk.getBuild(image_name, build_id)
|
||||||
if build and build.state == zk.READY:
|
if build and build.state in states:
|
||||||
|
break
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
# We should only expect a dib manifest with a successful build.
|
||||||
|
while build.state == zk.READY:
|
||||||
|
self.wait_for_threads()
|
||||||
|
files = builder.DibImageFile.from_image_id(
|
||||||
|
self._config_images_dir.path, base)
|
||||||
|
if files:
|
||||||
break
|
break
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
|
@ -152,6 +152,7 @@ diskimages:
|
|||||||
- cache-devstack
|
- cache-devstack
|
||||||
release: trusty
|
release: trusty
|
||||||
rebuild-age: 3600
|
rebuild-age: 3600
|
||||||
|
build-timeout: 3600
|
||||||
env-vars:
|
env-vars:
|
||||||
TMPDIR: /opt/dib_tmp
|
TMPDIR: /opt/dib_tmp
|
||||||
DIB_IMAGE_CACHE: /opt/dib_cache
|
DIB_IMAGE_CACHE: /opt/dib_cache
|
||||||
|
27
nodepool/tests/fixtures/diskimage_build_timeout.yaml
vendored
Normal file
27
nodepool/tests/fixtures/diskimage_build_timeout.yaml
vendored
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
elements-dir: .
|
||||||
|
images-dir: '{images_dir}'
|
||||||
|
build-log-dir: '{build_log_dir}'
|
||||||
|
|
||||||
|
zookeeper-servers:
|
||||||
|
- host: {zookeeper_host}
|
||||||
|
port: {zookeeper_port}
|
||||||
|
chroot: {zookeeper_chroot}
|
||||||
|
|
||||||
|
labels: []
|
||||||
|
|
||||||
|
providers: []
|
||||||
|
|
||||||
|
diskimages:
|
||||||
|
- name: fake-image
|
||||||
|
build-timeout: 2
|
||||||
|
formats:
|
||||||
|
- tar
|
||||||
|
elements:
|
||||||
|
- fedora
|
||||||
|
- vm
|
||||||
|
release: 21
|
||||||
|
env-vars:
|
||||||
|
TMPDIR: /opt/dib_tmp
|
||||||
|
DIB_IMAGE_CACHE: /opt/dib_cache
|
||||||
|
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
|
||||||
|
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
|
@ -16,6 +16,7 @@
|
|||||||
import os
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
import fixtures
|
import fixtures
|
||||||
|
import mock
|
||||||
|
|
||||||
from nodepool import builder, exceptions, tests
|
from nodepool import builder, exceptions, tests
|
||||||
from nodepool.driver.fake import provider as fakeprovider
|
from nodepool.driver.fake import provider as fakeprovider
|
||||||
@ -335,3 +336,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
|
|||||||
|
|
||||||
self.assertEqual(build_default._formats, ['qcow2'])
|
self.assertEqual(build_default._formats, ['qcow2'])
|
||||||
self.assertEqual(build_vhd._formats, ['vhd'])
|
self.assertEqual(build_vhd._formats, ['vhd'])
|
||||||
|
|
||||||
|
@mock.patch('select.poll')
|
||||||
|
def test_diskimage_build_timeout(self, mock_poll):
|
||||||
|
configfile = self.setup_config('diskimage_build_timeout.yaml')
|
||||||
|
builder.BUILD_PROCESS_POLL_TIMEOUT = 500
|
||||||
|
self.useBuilder(configfile, cleanup_interval=0)
|
||||||
|
self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))
|
||||||
|
6
releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
Normal file
6
releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
A new option (build-timeout) has been added to the builder diskimage
|
||||||
|
configuration to control how long the builder should wait for image
|
||||||
|
builds before giving up. The default is 8 hours.
|
Loading…
Reference in New Issue
Block a user