Sharding: root audit epoch reset warning

We've seen an epoch get reset on a root container in production. I imagine this could happen if get_own_range is called on a new primary before replication has moved the old db across, allowing the self-generated default to be used. This patch doesn't fix the issue, but adds a reset-epoch check to the root audit so we can log when/if it happens. That lets us 1. see when it happens and how systemic it is, and 2. decide whether it needs to be fixed automatically or can be handled on an ad-hoc basis. Change-Id: Ia7584ea35600df67dae1cf3694e0ef62612931fa
2021-07-06 12:13:37 +10:00 · 2021-07-06 12:13:37 +10:00 · da4010ae2b
commit da4010ae2b
parent 2117a32b99
2 changed files with 46 additions and 1 deletions
--- a/swift/container/sharder.py
+++ b/swift/container/sharder.py
@ -1031,6 +1031,14 @@ class ContainerSharder(ContainerSharderConf, ContainerReplicator):
                    'overlapping ranges in state %r: %s' %
                    (ShardRange.STATES[state], all_overlaps))

+        # We've seen a case in production where the root's own_shard_range
+        # epoch is reset to None, and state set to ACTIVE (like re-defaulted).
+        # Epoch is important to sharding so we want to detect if this happens
+        # 1. So we can alert, and 2. to see how common it is.
+        if own_shard_range.epoch is None and broker.db_epoch:
+            warnings.append('own_shard_range reset to None should be %s'
+                            % broker.db_epoch)
+
        if warnings:
            self.logger.warning(
                'Audit failed for root %s (%s): %s',
--- a/test/unit/container/test_sharder.py
+++ b/test/unit/container/test_sharder.py
@ -72,7 +72,7 @@ class BaseTestSharder(unittest.TestCase):
        datadir = os.path.join(
            self.tempdir, device, 'containers', str(part), hash_[-3:], hash_)
        if epoch:
-            filename = '%s_%s.db' % (hash, epoch)
+            filename = '%s_%s.db' % (hash_, epoch)
        else:
            filename = hash_ + '.db'
        db_file = os.path.join(datadir, filename)
@ -4914,6 +4914,43 @@ class TestSharder(BaseTestSharder):
            with annotate_failure(state):
                check_all_shard_ranges_sent(state)

+    def test_audit_root_container_reset_epoch(self):
+        epoch = next(self.ts_iter)
+        broker = self._make_broker(epoch=epoch.normal)
+        shard_bounds = (('', 'j'), ('j', 'k'), ('k', 's'),
+                        ('s', 'y'), ('y', ''))
+        shard_ranges = self._make_shard_ranges(shard_bounds,
+                                               ShardRange.ACTIVE,
+                                               timestamp=next(self.ts_iter))
+        broker.merge_shard_ranges(shard_ranges)
+        own_shard_range = broker.get_own_shard_range()
+        own_shard_range.update_state(ShardRange.SHARDED, next(self.ts_iter))
+        own_shard_range.epoch = epoch
+        broker.merge_shard_ranges(own_shard_range)
+        with self._mock_sharder() as sharder:
+            with mock.patch.object(
+                    sharder, '_audit_shard_container') as mocked:
+                sharder._audit_container(broker)
+        self.assertFalse(sharder.logger.get_lines_for_level('warning'))
+        self.assertFalse(sharder.logger.get_lines_for_level('error'))
+        self._assert_stats({'attempted': 1, 'success': 1, 'failure': 0},
+                           sharder, 'audit_root')
+        mocked.assert_not_called()
+
+        # test for a reset epoch
+        own_shard_range = broker.get_own_shard_range()
+        own_shard_range.epoch = None
+        own_shard_range.state_timestamp = next(self.ts_iter)
+        broker.merge_shard_ranges(own_shard_range)
+        with self._mock_sharder() as sharder:
+            with mock.patch.object(
+                    sharder, '_audit_shard_container') as mocked:
+                sharder._audit_container(broker)
+        lines = sharder.logger.get_lines_for_level('warning')
+
+        self.assertIn("own_shard_range reset to None should be %s"
+                      % broker.db_epoch, lines[0])
+
    def test_audit_root_container(self):
        broker = self._make_broker()