Sharding: root audit epoch reset warning

We've seen an epoch get reset on a root container in production. I
imagine this could happen if get_own_shard_range is called on a new
primary before replication has moved the old db across, allowing the
self-generated default to be used.

This patch doesn't fix the issue, but adds a reset-epoch check to the
root audit so we can log when/if it happens. That lets us 1. see when it
happens and how systemic it is, and 2. decide whether it needs to be
fixed automatically or can be handled on an ad-hoc basis.

Change-Id: Ia7584ea35600df67dae1cf3694e0ef62612931fa
This commit is contained in:
Matthew Oliver 2021-07-06 12:13:37 +10:00
parent 2117a32b99
commit da4010ae2b
2 changed files with 46 additions and 1 deletions

View File

@ -1031,6 +1031,14 @@ class ContainerSharder(ContainerSharderConf, ContainerReplicator):
'overlapping ranges in state %r: %s' %
(ShardRange.STATES[state], all_overlaps))
# We've seen a case in production where the root's own_shard_range
# epoch is reset to None and its state set to ACTIVE (as if re-defaulted).
# Epoch is important to sharding, so we want to detect when this happens:
# 1. so we can alert, and 2. to see how common it is.
if own_shard_range.epoch is None and broker.db_epoch:
warnings.append('own_shard_range reset to None should be %s'
% broker.db_epoch)
if warnings:
self.logger.warning(
'Audit failed for root %s (%s): %s',

View File

@ -72,7 +72,7 @@ class BaseTestSharder(unittest.TestCase):
datadir = os.path.join(
self.tempdir, device, 'containers', str(part), hash_[-3:], hash_)
if epoch:
filename = '%s_%s.db' % (hash, epoch)
filename = '%s_%s.db' % (hash_, epoch)
else:
filename = hash_ + '.db'
db_file = os.path.join(datadir, filename)
@ -4914,6 +4914,43 @@ class TestSharder(BaseTestSharder):
with annotate_failure(state):
check_all_shard_ranges_sent(state)
def test_audit_root_container_reset_epoch(self):
    """Root audit warns when own_shard_range's epoch has been reset.

    The audit is run twice against the same broker: first with the own
    shard range carrying an epoch, where no warnings or errors are
    expected, and again after the epoch has been merged back as None,
    where the reset warning is expected.
    """
    def run_audit():
        # audit the broker with the shard-container audit mocked out
        with self._mock_sharder() as sharder:
            with mock.patch.object(
                    sharder, '_audit_shard_container') as mock_shard_audit:
                sharder._audit_container(broker)
        return sharder, mock_shard_audit

    root_epoch = next(self.ts_iter)
    broker = self._make_broker(epoch=root_epoch.normal)
    bounds = (('', 'j'), ('j', 'k'), ('k', 's'), ('s', 'y'), ('y', ''))
    broker.merge_shard_ranges(self._make_shard_ranges(
        bounds, ShardRange.ACTIVE, timestamp=next(self.ts_iter)))

    # mark the own shard range as sharded with its epoch intact
    own_sr = broker.get_own_shard_range()
    own_sr.update_state(ShardRange.SHARDED, next(self.ts_iter))
    own_sr.epoch = root_epoch
    broker.merge_shard_ranges(own_sr)

    sharder, mock_shard_audit = run_audit()
    for level in ('warning', 'error'):
        self.assertFalse(sharder.logger.get_lines_for_level(level))
    self._assert_stats({'attempted': 1, 'success': 1, 'failure': 0},
                       sharder, 'audit_root')
    mock_shard_audit.assert_not_called()

    # test for a reset epoch
    own_sr = broker.get_own_shard_range()
    own_sr.epoch = None
    own_sr.state_timestamp = next(self.ts_iter)
    broker.merge_shard_ranges(own_sr)

    sharder, _ = run_audit()
    warning_lines = sharder.logger.get_lines_for_level('warning')
    expected = ("own_shard_range reset to None should be %s"
                % broker.db_epoch)
    self.assertIn(expected, warning_lines[0])
def test_audit_root_container(self):
broker = self._make_broker()