Support deleting DIB images while builder is offline
We don't currently support deleting a DIB image while the builder that built it is offline. The reason for this is to ensure that we actually remove the files from disk on the builder: the mechanism is for all other builders to defer handling "DELETING" image nodes in ZK to the builder that built them. This can be problematic if that builder is offline for an extended period, or permanently.

To address this case without compromising the original goal, we now let any builder delete the uploads and ZK nodes for a DIB build. In addition, every builder now looks for DIB manifest directories within its images dir and, if it finds one with no corresponding ZK node, garbage-collects that image from disk.

Change-Id: I65efb31ca02cea4bcf7ef8f962a00b5263ccf69c
parent d01f19d762
commit 6386170914
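As an illustrative sketch of the new garbage-collection pass (hypothetical standalone code mirroring the CleanupWorker changes below; zk_client, delete_build, and the DELETING value stand in for the real ZooKeeper API):

    import os
    import re

    MANIFEST_RE = re.compile(r'(.*)-(\d+)\.d')

    def gc_orphaned_builds(images_dir, zk_client, delete_build, DELETING='deleting'):
        # Scan the images dir for DIB manifest directories and remove any
        # build whose ZK node is gone or already marked DELETING.
        for entry in os.scandir(images_dir):
            m = MANIFEST_RE.match(entry.name)
            if not (m and entry.is_dir()):
                continue
            image_name, build_id = m.groups()
            build = zk_client.getBuild(image_name, build_id)
            if not build or build.state == DELETING:
                delete_build(image_name, build_id)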
nodepool/builder.py

@@ -15,6 +15,7 @@
 import fcntl
 import logging
 import os
+import re
 import select
 import shutil
 import socket
@@ -226,7 +227,7 @@ class CleanupWorker(BaseWorker):
         return False

     @staticmethod
-    def deleteLocalBuild(images_dir, image, build, log):
+    def deleteLocalBuild(images_dir, image_name, build_id, log):
         '''
         Remove expired image build from local disk.

@@ -247,12 +248,12 @@ class CleanupWorker(BaseWorker):
             if e.errno != 2:  # No such file or directory
                 raise e

-        base = "-".join([image, build.id])
+        base = "-".join([image_name, build_id])
         files = DibImageFile.from_image_id(images_dir, base)
         if not files:
             return

-        log.info("Doing cleanup for %s:%s" % (image, build.id))
+        log.info("Doing cleanup for %s:%s" % (image_name, build_id))

         manifest_dir = None

@@ -271,9 +272,9 @@ class CleanupWorker(BaseWorker):
             if e.errno != 2:  # No such file or directory
                 raise e

-    def _deleteLocalBuild(self, image, build):
+    def _deleteLocalBuild(self, image_name, build_id):
         CleanupWorker.deleteLocalBuild(
-            self._config.images_dir, image, build, self.log)
+            self._config.images_dir, image_name, build_id, self.log)

     def _cleanupProvider(self, provider, image, build_id):
         all_uploads = self._zk.getUploads(image, build_id, provider.name)
@@ -377,6 +378,27 @@ class CleanupWorker(BaseWorker):
             except Exception:
                 self.log.exception("Exception cleaning up image %s:", image)

+        for local_build in self._getLocalBuilds():
+            try:
+                image_name, build_id = local_build
+                build = self._zk.getBuild(image_name, build_id)
+                if not build or build.state == zk.DELETING:
+                    self.log.info("Deleting on-disk build: "
+                                  "%s-%s", image_name, build_id)
+                    self._deleteLocalBuild(image_name, build_id)
+            except Exception:
+                self.log.exception("Exception cleaning up local build %s:",
+                                   local_build)
+
+    MANIFEST_RE = re.compile(r'(.*)-(\d+)\.d')
+
+    def _getLocalBuilds(self):
+        for entry in os.scandir(self._config.images_dir):
+            m = self.MANIFEST_RE.match(entry.name)
+            if m and entry.is_dir():
+                image_name, build_id = m.groups()
+                yield (image_name, build_id)
+
     def _emitBuildRequestStats(self):
         '''Emit general build request stats

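For reference, MANIFEST_RE maps a manifest directory name back to its image name and build id; using the directory name that appears in the tests below:

    import re

    MANIFEST_RE = re.compile(r'(.*)-(\d+)\.d')
    m = MANIFEST_RE.match('fake-image-0000000001.d')
    assert m.groups() == ('fake-image', '0000000001')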
@@ -395,15 +417,6 @@ class CleanupWorker(BaseWorker):
             pipeline.gauge(key, count)
         pipeline.send()

-    def _filterLocalBuilds(self, image, builds):
-        '''Return the subset of builds that are local'''
-        ret = []
-        for build in builds:
-            is_local = build.builder_id == self._builder_id
-            if is_local:
-                ret.append(build)
-        return ret
-
     def _cleanupCurrentProviderUploads(self, provider, image, build_id):
         '''
         Remove cruft from a current build.
@@ -453,21 +466,16 @@
         # Get the list of all builds, then work from that so that we
         # have a consistent view of the data.
         all_builds = self._zk.getBuilds(image)
-        builds_to_keep = set([b for b in sorted(all_builds, reverse=True,
-                                                key=lambda y: y.state_time)
-                              if b.state == zk.READY][:2])
-        local_builds = set(self._filterLocalBuilds(image, all_builds))
         diskimage = self._config.diskimages.get(image)
-        if not diskimage and not local_builds:
-            # This builder is not configured to build this image and was not
-            # responsible for this image build so ignore it.
-            return
-        # Remove any local builds that are not in use.
         if not diskimage or (diskimage and not diskimage.image_types):
-            builds_to_keep -= local_builds
+            builds_to_keep = set()
             # TODO(jeblair): When all builds for an image which is not
             # in use are deleted, the image znode should be deleted as
             # well.
+        else:
+            builds_to_keep = set([b for b in sorted(all_builds, reverse=True,
+                                                    key=lambda y: y.state_time)
+                                  if b.state == zk.READY][:2])

         for build in all_builds:
             # Start by deleting any uploads that are no longer needed
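A minimal standalone illustration of the builds_to_keep selection above (Build and READY are stand-ins for the real ZK build objects): it keeps only the two READY builds with the most recent state_time.

    from collections import namedtuple

    Build = namedtuple('Build', ['id', 'state', 'state_time'])
    READY = 'ready'

    all_builds = [
        Build('0000000001', READY, 100),
        Build('0000000002', READY, 200),
        Build('0000000003', 'building', 300),
        Build('0000000004', READY, 400),
    ]
    builds_to_keep = set([b for b in sorted(all_builds, reverse=True,
                                            key=lambda y: y.state_time)
                          if b.state == READY][:2])
    assert {b.id for b in builds_to_keep} == {'0000000002', '0000000004'}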
@@ -518,13 +526,13 @@
             for p in self._zk.getBuildProviders(image, build.id):
                 uploads += self._zk.getUploads(image, build.id, p)

-            # If we own this build, delete the local DIB files as soon as all
-            # provider uploads are in a deleting state. This prevents us from
-            # keeping local files around while we wait on clouds to remove
-            # the image on their side (which can be very slow).
+            # Mark the image as deleting (which will permit deleting
+            # local DIB files) as soon as all provider uploads are in
+            # a deleting state. This prevents us from keeping local
+            # files around while we wait on clouds to remove the image
+            # on their side (which can be very slow).
             all_deleting = all(map(lambda x: x.state == zk.DELETING, uploads))
-            is_local = build.builder_id == self._builder_id
-            if (not uploads or all_deleting) and is_local:
+            if (not uploads or all_deleting):
                 # If we got here, it's either an explicit delete (user
                 # activated), or we're deleting because we have newer
                 # images. We're about to start deleting files, so
@@ -536,16 +544,10 @@
                 build.state = zk.DELETING
                 self._zk.storeBuild(image, build, build.id)

-                self._deleteLocalBuild(image, build)
-
             if not uploads:
-                if is_local:
-                    # Only remove the db build record from the builder that
-                    # built this image. This prevents us from removing the db
-                    # record without first removing the local image files.
-                    if not self._zk.deleteBuild(image, build.id):
-                        self.log.error("Unable to delete build %s because"
-                                       " uploads still remain.", build)
+                if not self._zk.deleteBuild(image, build.id):
+                    self.log.error("Unable to delete build %s because"
+                                   " uploads still remain.", build)

     def run(self):
         '''
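One subtlety in the all_deleting check above: Python's all() is vacuously True for an empty sequence, so the explicit `not uploads` test is what makes the no-uploads case read as intentional rather than accidental. A quick demonstration ('deleting' stands in for zk.DELETING):

    uploads = []
    all_deleting = all(map(lambda x: x.state == 'deleting', uploads))
    assert all_deleting is True  # vacuously true: no uploads at all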
@@ -729,7 +731,7 @@ class BuildWorker(BaseWorker):
                            bnum, diskimage.name)
             data.id = bnum
             CleanupWorker.deleteLocalBuild(
-                self._config.images_dir, diskimage.name, data, self.log)
+                self._config.images_dir, diskimage.name, data.id, self.log)
             data.state = zk.FAILED
             return data

nodepool/tests/test_commands.py

@@ -257,6 +257,46 @@ class TestNodepoolCMD(tests.DBTestCase):
         self.assert_listed(
             configfile, ['dib-image-list'], 0, 'fake-image-0000000001', 0)

+    def test_dib_image_delete_two_builders(self):
+        # This tests deleting an image when its builder is offline
+        # 1. Build the image with builder1
+        configfile1 = self.setup_config('node.yaml')
+        builder1 = self.useBuilder(configfile1)
+        self.waitForImage('fake-provider', 'fake-image')
+        # Check the image exists
+        builds = self.zk.getMostRecentBuilds(1, 'fake-image', zk.READY)
+
+        # 2. Stop builder1; start builder2
+        for worker in builder1._upload_workers:
+            worker.shutdown()
+            worker.join()
+        builder1.stop()
+        # setup_config() makes a new images_dir each time, so this
+        # acts as a different builder.
+        configfile2 = self.setup_config('node.yaml')
+        builder2 = self.useBuilder(configfile2)
+
+        # 3. Set image to 'deleted' in ZK
+        self.patch_argv('-c', configfile1, 'dib-image-delete',
+                        'fake-image-%s' % (builds[0].id))
+        nodepoolcmd.main()
+
+        # 4. Verify builder2 deleted the ZK records, but image is still on disk
+        self.waitForBuildDeletion('fake-image', '0000000001')
+        self.assertTrue(
+            os.path.exists(os.path.join(builder1._config.images_dir,
+                                        'fake-image-0000000001.d')))
+        self.assertFalse(
+            os.path.exists(os.path.join(builder2._config.images_dir,
+                                        'fake-image-0000000001.d')))
+
+        # 5. Start builder1 and verify it deletes image on disk
+        builder1 = self.useBuilder(configfile1)
+        for _ in iterate_timeout(30, AssertionError, 'image file delete'):
+            if not os.path.exists(os.path.join(builder1._config.images_dir,
+                                               'fake-image-0000000001.d')):
+                break
+
     def test_delete(self):
         configfile = self.setup_config('node.yaml')
         pool = self.useNodepool(configfile, watermark_sleep=1)