From da4010ae2b52d762a9a69ee4bd68e94298e0ac6a Mon Sep 17 00:00:00 2001
From: Matthew Oliver <matt@oliver.net.au>
Date: Tue, 6 Jul 2021 12:13:37 +1000
Subject: [PATCH] Sharding: root audit epoch reset warning

We've seen an epoch get reset on a root container in production. I
imagine this could happen if get_own_shard_range is called on a new
primary before replication has moved the old db across, allowing the
self-generated default own shard range to be used.

This patch doesn't fix the issue, but adds a reset-epoch check to the
root audit so we can log when/if it happens. That lets us 1. see when it
happens and how systemic it is, and 2. decide whether it needs to be
fixed automatically or can be handled on an ad-hoc basis.

Change-Id: Ia7584ea35600df67dae1cf3694e0ef62612931fa
---
 swift/container/sharder.py          |  8 ++++++
 test/unit/container/test_sharder.py | 39 ++++++++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/swift/container/sharder.py b/swift/container/sharder.py
index 4a243deabd..7a39f17c49 100644
--- a/swift/container/sharder.py
+++ b/swift/container/sharder.py
@@ -1031,6 +1031,14 @@ class ContainerSharder(ContainerSharderConf, ContainerReplicator):
                     'overlapping ranges in state %r: %s' %
                     (ShardRange.STATES[state], all_overlaps))
 
+        # We've seen a case in production where the root's own_shard_range
+        # epoch is reset to None and its state set to ACTIVE (as if it had
+        # been re-defaulted). Epoch is important to sharding, so we want to
+        # detect this: 1. so we can alert, and 2. to see how common it is.
+        if own_shard_range.epoch is None and broker.db_epoch:
+            warnings.append('own_shard_range reset to None should be %s'
+                            % broker.db_epoch)
+
         if warnings:
             self.logger.warning(
                 'Audit failed for root %s (%s): %s',
diff --git a/test/unit/container/test_sharder.py b/test/unit/container/test_sharder.py
index 17120c2b57..4026f79001 100644
--- a/test/unit/container/test_sharder.py
+++ b/test/unit/container/test_sharder.py
@@ -72,7 +72,7 @@ class BaseTestSharder(unittest.TestCase):
         datadir = os.path.join(
             self.tempdir, device, 'containers', str(part), hash_[-3:], hash_)
         if epoch:
-            filename = '%s_%s.db' % (hash, epoch)
+            filename = '%s_%s.db' % (hash_, epoch)
         else:
             filename = hash_ + '.db'
         db_file = os.path.join(datadir, filename)
@@ -4914,6 +4914,43 @@ class TestSharder(BaseTestSharder):
             with annotate_failure(state):
                 check_all_shard_ranges_sent(state)
 
+    def test_audit_root_container_reset_epoch(self):
+        epoch = next(self.ts_iter)
+        broker = self._make_broker(epoch=epoch.normal)
+        shard_bounds = (('', 'j'), ('j', 'k'), ('k', 's'),
+                        ('s', 'y'), ('y', ''))
+        shard_ranges = self._make_shard_ranges(shard_bounds,
+                                               ShardRange.ACTIVE,
+                                               timestamp=next(self.ts_iter))
+        broker.merge_shard_ranges(shard_ranges)
+        own_shard_range = broker.get_own_shard_range()
+        own_shard_range.update_state(ShardRange.SHARDED, next(self.ts_iter))
+        own_shard_range.epoch = epoch
+        broker.merge_shard_ranges(own_shard_range)
+        with self._mock_sharder() as sharder:
+            with mock.patch.object(
+                    sharder, '_audit_shard_container') as mocked:
+                sharder._audit_container(broker)
+        self.assertFalse(sharder.logger.get_lines_for_level('warning'))
+        self.assertFalse(sharder.logger.get_lines_for_level('error'))
+        self._assert_stats({'attempted': 1, 'success': 1, 'failure': 0},
+                           sharder, 'audit_root')
+        mocked.assert_not_called()
+
+        # test for a reset epoch
+        own_shard_range = broker.get_own_shard_range()
+        own_shard_range.epoch = None
+        own_shard_range.state_timestamp = next(self.ts_iter)
+        broker.merge_shard_ranges(own_shard_range)
+        with self._mock_sharder() as sharder:
+            with mock.patch.object(
+                    sharder, '_audit_shard_container') as mocked:
+                sharder._audit_container(broker)
+        lines = sharder.logger.get_lines_for_level('warning')
+
+        self.assertIn("own_shard_range reset to None should be %s"
+                      % broker.db_epoch, lines[0])
+
     def test_audit_root_container(self):
         broker = self._make_broker()