Address a sharder/replicator race

In the previous patch, we could clean up all container DBs, but only if
the daemons went in a specific order (which cannot be guaranteed in a
production system).

Once a reclaim age passes, there's a race: If the container-replicator
processes the root container before the container-sharder processes the
shards, the deleted shards would get reaped from the root so they won't
be available for the sharder. The shard containers then hang around
indefinitely.

Now, be willing to mark shard DBs as deleted even when we can't find our
own shard range in the root. Fortunately, the shard already knows that
its range has been deleted; we don't need to get that info from the root.

Change-Id: If08bccf753490157f27c95b4038f3dd33d3d7f8c
Related-Change: Icba98f1c9e17e8ade3f0e1b9a23360cf5ab8c86b
This commit is contained in:
Tim Burke 2020-06-25 14:08:34 -07:00
parent a4cf508a38
commit 36bd21488e
2 changed files with 13 additions and 9 deletions

View File

@ -746,8 +746,9 @@ class ContainerSharder(ContainerReplicator):
# root may not yet know about this shard container
warnings.append('root has no matching shard range')
shard_range = None
else:
elif not own_shard_range.deleted:
warnings.append('unable to get shard ranges from root')
# else, our shard range is deleted, so root may have reclaimed it
else:
errors.append('missing own shard range')
@ -767,14 +768,16 @@ class ContainerSharder(ContainerReplicator):
self.logger.debug('Updating shard from root %s', dict(shard_range))
broker.merge_shard_ranges(shard_range)
own_shard_range = broker.get_own_shard_range()
delete_age = time.time() - self.reclaim_age
if (own_shard_range.state == ShardRange.SHARDED and
own_shard_range.deleted and
own_shard_range.timestamp < delete_age and
broker.empty()):
broker.delete_db(Timestamp.now().internal)
self.logger.debug('Deleted shard container %s (%s)',
broker.db_file, quote(broker.path))
delete_age = time.time() - self.reclaim_age
if (own_shard_range.state == ShardRange.SHARDED and
own_shard_range.deleted and
own_shard_range.timestamp < delete_age and
broker.empty()):
broker.delete_db(Timestamp.now().internal)
self.logger.debug('Deleted shard container %s (%s)',
broker.db_file, quote(broker.path))
self._increment_stat('audit_shard', 'success', statsd=True)
return True

View File

@ -4626,6 +4626,7 @@ class TestSharder(BaseTestSharder):
own_shard_range.deleted = 1
own_shard_range.timestamp = Timestamp.now()
broker.merge_shard_ranges([own_shard_range])
del shard_ranges[:] # root responds with no shard ranges
assert_ok()
self.assertTrue(broker.is_deleted())