From da4010ae2b52d762a9a69ee4bd68e94298e0ac6a Mon Sep 17 00:00:00 2001 From: Matthew Oliver Date: Tue, 6 Jul 2021 12:13:37 +1000 Subject: [PATCH] Sharding: root audit epoch reset warning We've seen an epoch get reset on a root container in production. I imagine this could happen if get_own_shard_range is called on a new primary before replication has moved the old db across, allowing the self-generated default to be used. This patch doesn't fix the issue, but adds a reset-epoch check to the root audit so we can log when/if it happens. That lets us 1. see when it happens and how systemic it is, and 2. decide whether it needs to be fixed automatically or is something that can be handled on an ad-hoc basis. Change-Id: Ia7584ea35600df67dae1cf3694e0ef62612931fa --- swift/container/sharder.py | 8 ++++++ test/unit/container/test_sharder.py | 39 ++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/swift/container/sharder.py b/swift/container/sharder.py index 4a243deabd..7a39f17c49 100644 --- a/swift/container/sharder.py +++ b/swift/container/sharder.py @@ -1031,6 +1031,14 @@ class ContainerSharder(ContainerSharderConf, ContainerReplicator): 'overlapping ranges in state %r: %s' % (ShardRange.STATES[state], all_overlaps)) + # We've seen a case in production where the root's own_shard_range + # epoch is reset to None, and state set to ACTIVE (like re-defaulted). + # Epoch is important to sharding so we want to detect if this happens + # 1. So we can alert, and 2. to see how common it is. 
+ if own_shard_range.epoch is None and broker.db_epoch: + warnings.append('own_shard_range reset to None should be %s' + % broker.db_epoch) + if warnings: self.logger.warning( 'Audit failed for root %s (%s): %s', diff --git a/test/unit/container/test_sharder.py b/test/unit/container/test_sharder.py index 17120c2b57..4026f79001 100644 --- a/test/unit/container/test_sharder.py +++ b/test/unit/container/test_sharder.py @@ -72,7 +72,7 @@ class BaseTestSharder(unittest.TestCase): datadir = os.path.join( self.tempdir, device, 'containers', str(part), hash_[-3:], hash_) if epoch: - filename = '%s_%s.db' % (hash, epoch) + filename = '%s_%s.db' % (hash_, epoch) else: filename = hash_ + '.db' db_file = os.path.join(datadir, filename) @@ -4914,6 +4914,43 @@ class TestSharder(BaseTestSharder): with annotate_failure(state): check_all_shard_ranges_sent(state) + def test_audit_root_container_reset_epoch(self): + epoch = next(self.ts_iter) + broker = self._make_broker(epoch=epoch.normal) + shard_bounds = (('', 'j'), ('j', 'k'), ('k', 's'), + ('s', 'y'), ('y', '')) + shard_ranges = self._make_shard_ranges(shard_bounds, + ShardRange.ACTIVE, + timestamp=next(self.ts_iter)) + broker.merge_shard_ranges(shard_ranges) + own_shard_range = broker.get_own_shard_range() + own_shard_range.update_state(ShardRange.SHARDED, next(self.ts_iter)) + own_shard_range.epoch = epoch + broker.merge_shard_ranges(own_shard_range) + with self._mock_sharder() as sharder: + with mock.patch.object( + sharder, '_audit_shard_container') as mocked: + sharder._audit_container(broker) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self._assert_stats({'attempted': 1, 'success': 1, 'failure': 0}, + sharder, 'audit_root') + mocked.assert_not_called() + + # test for a reset epoch + own_shard_range = broker.get_own_shard_range() + own_shard_range.epoch = None + own_shard_range.state_timestamp = next(self.ts_iter) + 
broker.merge_shard_ranges(own_shard_range) + with self._mock_sharder() as sharder: + with mock.patch.object( + sharder, '_audit_shard_container') as mocked: + sharder._audit_container(broker) + lines = sharder.logger.get_lines_for_level('warning') + + self.assertIn("own_shard_range reset to None should be %s" + % broker.db_epoch, lines[0]) + def test_audit_root_container(self): broker = self._make_broker()