Support deleting DIB images while builder is offline
We don't currently support deleting a DIB image while the builder that
built it is offline. The reason for this is to ensure that we actually
remove the files from disk on the builder. The mechanism is for all
other builders to defer handling "DELETING" image nodes in ZK to the
builder which built them. This can be problematic if the builder is
offline for an extended period, or permanently.

To address this case without compromising the original goal, we now
let any builder delete the uploads and ZK nodes for a DIB build. Every
builder then looks for DIB manifest directories within its image-dir,
and if it finds one that does not have a corresponding ZK node, it
garbage-collects that image from disk.

Change-Id: I65efb31ca02cea4bcf7ef8f962a00b5263ccf69c
parent d01f19d762
commit 6386170914
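
In outline, the new garbage-collection pass works as sketched below.
This is a condensed, illustrative reading of the diff that follows, not
the exact implementation: the get_build/delete_build callables and the
'deleting' string are stand-ins for nodepool's self._zk.getBuild,
_deleteLocalBuild, and zk.DELETING.

    import os
    import re

    # Matches DIB manifest directories, e.g. "fake-image-0000000001.d".
    MANIFEST_RE = re.compile(r'(.*)-(\d+)\.d')

    def garbage_collect_local_builds(images_dir, get_build, delete_build, log):
        """Remove on-disk DIB builds whose ZK record is gone or DELETING."""
        for entry in os.scandir(images_dir):
            m = MANIFEST_RE.match(entry.name)
            if not (m and entry.is_dir()):
                continue
            image_name, build_id = m.groups()
            # If another builder already deleted the ZK node, or the
            # build is marked for deletion, remove the local files here.
            build = get_build(image_name, build_id)
            if not build or build.state == 'deleting':
                log.info("Deleting on-disk build: %s-%s", image_name, build_id)
                delete_build(image_name, build_id)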
diff --git a/nodepool/builder.py b/nodepool/builder.py
@@ -15,6 +15,7 @@
 import fcntl
 import logging
 import os
+import re
 import select
 import shutil
 import socket
@@ -226,7 +227,7 @@ class CleanupWorker(BaseWorker):
         return False

     @staticmethod
-    def deleteLocalBuild(images_dir, image, build, log):
+    def deleteLocalBuild(images_dir, image_name, build_id, log):
         '''
         Remove expired image build from local disk.

@@ -247,12 +248,12 @@ class CleanupWorker(BaseWorker):
             if e.errno != 2:    # No such file or directory
                 raise e

-        base = "-".join([image, build.id])
+        base = "-".join([image_name, build_id])
         files = DibImageFile.from_image_id(images_dir, base)
         if not files:
             return

-        log.info("Doing cleanup for %s:%s" % (image, build.id))
+        log.info("Doing cleanup for %s:%s" % (image_name, build_id))

         manifest_dir = None

@@ -271,9 +272,9 @@ class CleanupWorker(BaseWorker):
             if e.errno != 2:    # No such file or directory
                 raise e

-    def _deleteLocalBuild(self, image, build):
+    def _deleteLocalBuild(self, image_name, build_id):
         CleanupWorker.deleteLocalBuild(
-            self._config.images_dir, image, build, self.log)
+            self._config.images_dir, image_name, build_id, self.log)

     def _cleanupProvider(self, provider, image, build_id):
         all_uploads = self._zk.getUploads(image, build_id, provider.name)
@@ -377,6 +378,27 @@ class CleanupWorker(BaseWorker):
             except Exception:
                 self.log.exception("Exception cleaning up image %s:", image)

+        for local_build in self._getLocalBuilds():
+            try:
+                image_name, build_id = local_build
+                build = self._zk.getBuild(image_name, build_id)
+                if not build or build.state == zk.DELETING:
+                    self.log.info("Deleting on-disk build: "
+                                  "%s-%s", image_name, build_id)
+                    self._deleteLocalBuild(image_name, build_id)
+            except Exception:
+                self.log.exception("Exception cleaning up local build %s:",
+                                   local_build)
+
+    MANIFEST_RE = re.compile(r'(.*)-(\d+)\.d')
+
+    def _getLocalBuilds(self):
+        for entry in os.scandir(self._config.images_dir):
+            m = self.MANIFEST_RE.match(entry.name)
+            if m and entry.is_dir():
+                image_name, build_id = m.groups()
+                yield (image_name, build_id)
+
     def _emitBuildRequestStats(self):
         '''Emit general build request stats

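As a quick aside on the pattern above: MANIFEST_RE splits a manifest
directory name into its image name and build id (the greedy first group
backtracks to the last hyphen that precedes the digits and ".d"). A
small, self-contained check using the fake-image name from the tests
below:

    import re

    MANIFEST_RE = re.compile(r'(.*)-(\d+)\.d')

    m = MANIFEST_RE.match('fake-image-0000000001.d')
    assert m is not None
    assert m.groups() == ('fake-image', '0000000001')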
@@ -395,15 +417,6 @@ class CleanupWorker(BaseWorker):
                 pipeline.gauge(key, count)
             pipeline.send()

-    def _filterLocalBuilds(self, image, builds):
-        '''Return the subset of builds that are local'''
-        ret = []
-        for build in builds:
-            is_local = build.builder_id == self._builder_id
-            if is_local:
-                ret.append(build)
-        return ret
-
     def _cleanupCurrentProviderUploads(self, provider, image, build_id):
         '''
         Remove cruft from a current build.
@@ -453,21 +466,16 @@ class CleanupWorker(BaseWorker):
         # Get the list of all builds, then work from that so that we
         # have a consistent view of the data.
         all_builds = self._zk.getBuilds(image)
-        builds_to_keep = set([b for b in sorted(all_builds, reverse=True,
-                                                key=lambda y: y.state_time)
-                              if b.state == zk.READY][:2])
-        local_builds = set(self._filterLocalBuilds(image, all_builds))
         diskimage = self._config.diskimages.get(image)
-        if not diskimage and not local_builds:
-            # This builder is not configured to build this image and was not
-            # responsible for this image build so ignore it.
-            return
-        # Remove any local builds that are not in use.
         if not diskimage or (diskimage and not diskimage.image_types):
-            builds_to_keep -= local_builds
+            builds_to_keep = set()
             # TODO(jeblair): When all builds for an image which is not
             # in use are deleted, the image znode should be deleted as
             # well.
+        else:
+            builds_to_keep = set([b for b in sorted(all_builds, reverse=True,
+                                                    key=lambda y: y.state_time)
+                                  if b.state == zk.READY][:2])

         for build in all_builds:
             # Start by deleting any uploads that are no longer needed
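To illustrate the retention rule above with concrete values (a minimal,
self-contained sketch; the Build dataclass and the 'ready' string are
stand-ins for nodepool's real build records and zk.READY):

    from dataclasses import dataclass

    READY = 'ready'  # stand-in for zk.READY

    @dataclass(frozen=True)
    class Build:
        id: str
        state: str
        state_time: float

    all_builds = [
        Build('0000000001', READY, 100.0),
        Build('0000000002', READY, 200.0),
        Build('0000000003', READY, 300.0),
        Build('0000000004', 'building', 400.0),
    ]

    # Keep only the two most recently updated READY builds.
    builds_to_keep = set([b for b in sorted(all_builds, reverse=True,
                                            key=lambda y: y.state_time)
                          if b.state == READY][:2])
    assert {b.id for b in builds_to_keep} == {'0000000002', '0000000003'}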
@@ -518,13 +526,13 @@ class CleanupWorker(BaseWorker):
         for p in self._zk.getBuildProviders(image, build.id):
             uploads += self._zk.getUploads(image, build.id, p)

-        # If we own this build, delete the local DIB files as soon as all
-        # provider uploads are in a deleting state. This prevents us from
-        # keeping local files around while we wait on clouds to remove
-        # the image on their side (which can be very slow).
+        # Mark the image as deleting (which will permit deleting
+        # local DIB files) as soon as all provider uploads are in
+        # a deleting state. This prevents us from keeping local
+        # files around while we wait on clouds to remove the image
+        # on their side (which can be very slow).
         all_deleting = all(map(lambda x: x.state == zk.DELETING, uploads))
-        is_local = build.builder_id == self._builder_id
-        if (not uploads or all_deleting) and is_local:
+        if (not uploads or all_deleting):
             # If we got here, it's either an explicit delete (user
             # activated), or we're deleting because we have newer
             # images. We're about to start deleting files, so
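One subtlety in the condition above: Python's all() is vacuously True
for an empty sequence, so all_deleting is already True when there are
no uploads; the explicit "not uploads" check simply makes that case
obvious. A one-line demonstration:

    assert all(map(lambda x: x.state == 'deleting', [])) is True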
@@ -536,16 +544,10 @@ class CleanupWorker(BaseWorker):
                 build.state = zk.DELETING
                 self._zk.storeBuild(image, build, build.id)

-            self._deleteLocalBuild(image, build)
-
         if not uploads:
-            if is_local:
-                # Only remove the db build record from the builder that
-                # built this image. This prevents us from removing the db
-                # record without first removing the local image files.
-                if not self._zk.deleteBuild(image, build.id):
-                    self.log.error("Unable to delete build %s because"
-                                   " uploads still remain.", build)
+            if not self._zk.deleteBuild(image, build.id):
+                self.log.error("Unable to delete build %s because"
+                               " uploads still remain.", build)

     def run(self):
         '''
@@ -729,7 +731,7 @@ class BuildWorker(BaseWorker):
                           bnum, diskimage.name)
             data.id = bnum
             CleanupWorker.deleteLocalBuild(
-                self._config.images_dir, diskimage.name, data, self.log)
+                self._config.images_dir, diskimage.name, data.id, self.log)
             data.state = zk.FAILED
             return data

diff --git a/nodepool/tests/test_commands.py b/nodepool/tests/test_commands.py
@@ -257,6 +257,46 @@ class TestNodepoolCMD(tests.DBTestCase):
         self.assert_listed(
             configfile, ['dib-image-list'], 0, 'fake-image-0000000001', 0)

+    def test_dib_image_delete_two_builders(self):
+        # This tests deleting an image when its builder is offline
+        # 1. Build the image with builder1
+        configfile1 = self.setup_config('node.yaml')
+        builder1 = self.useBuilder(configfile1)
+        self.waitForImage('fake-provider', 'fake-image')
+        # Check the image exists
+        builds = self.zk.getMostRecentBuilds(1, 'fake-image', zk.READY)
+
+        # 2. Stop builder1; start builder2
+        for worker in builder1._upload_workers:
+            worker.shutdown()
+            worker.join()
+        builder1.stop()
+        # setup_config() makes a new images_dir each time, so this
+        # acts as a different builder.
+        configfile2 = self.setup_config('node.yaml')
+        builder2 = self.useBuilder(configfile2)
+
+        # 3. Set image to 'deleted' in ZK
+        self.patch_argv('-c', configfile1, 'dib-image-delete',
+                        'fake-image-%s' % (builds[0].id))
+        nodepoolcmd.main()
+
+        # 4. Verify builder2 deleted the ZK records, but image is still on disk
+        self.waitForBuildDeletion('fake-image', '0000000001')
+        self.assertTrue(
+            os.path.exists(os.path.join(builder1._config.images_dir,
+                                        'fake-image-0000000001.d')))
+        self.assertFalse(
+            os.path.exists(os.path.join(builder2._config.images_dir,
+                                        'fake-image-0000000001.d')))
+
+        # 5. Start builder1 and verify it deletes image on disk
+        builder1 = self.useBuilder(configfile1)
+        for _ in iterate_timeout(30, AssertionError, 'image file delete'):
+            if not os.path.exists(os.path.join(builder1._config.images_dir,
+                                               'fake-image-0000000001.d')):
+                break
+
     def test_delete(self):
         configfile = self.setup_config('node.yaml')
         pool = self.useNodepool(configfile, watermark_sleep=1)
|
Loading…
x
Reference in New Issue
Block a user