From 36bd21488e2b1c2fce667412c05ca439c3a17cde Mon Sep 17 00:00:00 2001 From: Tim Burke Date: Thu, 25 Jun 2020 14:08:34 -0700 Subject: [PATCH] Address a sharder/replicator race In the previous patch, we could clean up all container DBs, but only if the daemons went in a specific order (which cannot be guaranteed in a production system). Once a reclaim age passes, there's a race: If the container-replicator processes the root container before the container-sharder processes the shards, the deleted shards would get reaped from the root so they won't be available for the sharder. The shard containers then hang around indefinitely. Now, be willing to mark shard DBs as deleted even when we can't find our own shard range in the root. Fortunately, the shard already knows that its range has been deleted; we don't need to get that info from the root. Change-Id: If08bccf753490157f27c95b4038f3dd33d3d7f8c Related-Change: Icba98f1c9e17e8ade3f0e1b9a23360cf5ab8c86b --- swift/container/sharder.py | 21 ++++++++++++--------- test/unit/container/test_sharder.py | 1 + 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/swift/container/sharder.py b/swift/container/sharder.py index dd33043ae4..a942992029 100644 --- a/swift/container/sharder.py +++ b/swift/container/sharder.py @@ -746,8 +746,9 @@ class ContainerSharder(ContainerReplicator): # root may not yet know about this shard container warnings.append('root has no matching shard range') shard_range = None - else: + elif not own_shard_range.deleted: warnings.append('unable to get shard ranges from root') + # else, our shard range is deleted, so root may have reclaimed it else: errors.append('missing own shard range') @@ -767,14 +768,16 @@ class ContainerSharder(ContainerReplicator): self.logger.debug('Updating shard from root %s', dict(shard_range)) broker.merge_shard_ranges(shard_range) own_shard_range = broker.get_own_shard_range() - delete_age = time.time() - self.reclaim_age - if (own_shard_range.state == ShardRange.SHARDED and - own_shard_range.deleted and - own_shard_range.timestamp < delete_age and - broker.empty()): - broker.delete_db(Timestamp.now().internal) - self.logger.debug('Deleted shard container %s (%s)', - broker.db_file, quote(broker.path)) + + delete_age = time.time() - self.reclaim_age + if (own_shard_range.state == ShardRange.SHARDED and + own_shard_range.deleted and + own_shard_range.timestamp < delete_age and + broker.empty()): + broker.delete_db(Timestamp.now().internal) + self.logger.debug('Deleted shard container %s (%s)', + broker.db_file, quote(broker.path)) + self._increment_stat('audit_shard', 'success', statsd=True) return True diff --git a/test/unit/container/test_sharder.py b/test/unit/container/test_sharder.py index a54ddb652b..02485b79be 100644 --- a/test/unit/container/test_sharder.py +++ b/test/unit/container/test_sharder.py @@ -4626,6 +4626,7 @@ class TestSharder(BaseTestSharder): own_shard_range.deleted = 1 own_shard_range.timestamp = Timestamp.now() broker.merge_shard_ranges([own_shard_range]) + del shard_ranges[:] # root responds with no shard ranges assert_ok() self.assertTrue(broker.is_deleted())