Continue trying uploads for other providers after failure

In order to avoid spending hours or days uploading images with a
stale configuration, we exit the upload loop after each successful
upload, refresh our config, and then resume where we left off.

But this assumes that uploads eventually succeed; otherwise if we
attempt an upload and it fails, we will still restart the loop and
therefore re-attempt the same upload.

To avoid this situation, only exit the loop when an upload succeeds.

Change-Id: Ib4422382a0d2abbfa55c83922b68c1b4df482dcb
This commit is contained in:
James E. Blair 2024-06-18 10:50:33 -07:00
parent 4d05fd142d
commit 798fd52733
2 changed files with 30 additions and 6 deletions

View File

@ -1335,7 +1335,8 @@ class UploadWorker(BaseWorker):
exception handling can treat all provider-image uploads
indepedently.
:returns: True if an upload was attempted, False otherwise.
:returns: True if an upload was attempted and succeeded,
False otherwise.
'''
self._image_status.setdefault(image.name, {})
@ -1411,7 +1412,13 @@ class UploadWorker(BaseWorker):
# Set final state
self._zk.storeImageUpload(image.name, build.id,
provider.name, data, upnum)
return True
if data.state == zk.READY:
return True
# If we return true after an error, we will get stuck
# in a loop where we retry this image repeatedly at
# the expense of other providers, so even if we tried,
# if we failed, return False.
return False
except exceptions.ZKLockException:
# Lock is already held. Skip it.
return False

View File

@ -183,7 +183,7 @@ class TestNodePoolBuilder(tests.DBTestCase):
fakeadapter.FakeAdapter, '_getClient',
get_fake_client))
configfile = self.setup_config('node.yaml')
configfile = self.setup_config('node_two_provider.yaml')
# NOTE(pabelanger): Disable CleanupWorker thread for nodepool-builder
# as we currently race it to validate our failed uploads.
self.useBuilder(configfile, cleanup_interval=0)
@ -197,9 +197,26 @@ class TestNodePoolBuilder(tests.DBTestCase):
state=zk.READY)
self.assertEqual(1, len(newest_builds))
uploads = self.zk.getUploads('fake-image', newest_builds[0].id,
'fake-provider', states=[zk.FAILED])
self.assertEqual(1, len(uploads))
# Assert that it failed once and succeeded once for fake-provider
uploads1 = self.zk.getUploads('fake-image', newest_builds[0].id,
'fake-provider', states=[zk.FAILED])
self.assertEqual(1, len(uploads1))
uploads2 = self.zk.getUploads('fake-image', newest_builds[0].id,
'fake-provider', states=[zk.READY])
self.assertEqual(1, len(uploads2))
# Assert that it never failed and succeeded once for fake-provider2
uploads3 = self.zk.getUploads('fake-image', newest_builds[0].id,
'fake-provider2', states=[zk.FAILED])
self.assertEqual(0, len(uploads3))
uploads4 = self.zk.getUploads('fake-image', newest_builds[0].id,
'fake-provider2', states=[zk.READY])
self.assertEqual(1, len(uploads4))
# Assert that we performed the upload to the second provider
# before we retried the first one (this verifies that we
# continue working with other providers after a failure).
self.assertTrue(uploads4[0].state_time < uploads2[0].state_time)
def test_provider_addition(self):
configfile = self.setup_config('node.yaml')