Sharding: root audit epoch reset warning
We've seen an epoch get reset on a root container in production. I imagine this could happen if get_own_range is called on a new primary before replication has moved the old db across, allowing the self-generated default to be used. This patch doesn't fix the issue, but adds a reset-epoch check to the root audit so we can log when/if it happens. That way we can 1. see when it happens and how systemic it is, and 2. decide whether it needs to be fixed automatically or can be handled on an ad-hoc basis. Change-Id: Ia7584ea35600df67dae1cf3694e0ef62612931fa
This commit is contained in:
parent
2117a32b99
commit
da4010ae2b
@ -1031,6 +1031,14 @@ class ContainerSharder(ContainerSharderConf, ContainerReplicator):
|
||||
'overlapping ranges in state %r: %s' %
|
||||
(ShardRange.STATES[state], all_overlaps))
|
||||
|
||||
# We've seen a case in production where the root's own_shard_range
|
||||
# epoch is reset to None, and state set to ACTIVE (like re-defaulted)
|
||||
# Epoch is important to sharding, so we want to detect if this happens
|
||||
# 1. So we can alert, and 2. to see how common it is.
|
||||
if own_shard_range.epoch is None and broker.db_epoch:
|
||||
warnings.append('own_shard_range reset to None should be %s'
|
||||
% broker.db_epoch)
|
||||
|
||||
if warnings:
|
||||
self.logger.warning(
|
||||
'Audit failed for root %s (%s): %s',
|
||||
|
@ -72,7 +72,7 @@ class BaseTestSharder(unittest.TestCase):
|
||||
datadir = os.path.join(
|
||||
self.tempdir, device, 'containers', str(part), hash_[-3:], hash_)
|
||||
if epoch:
|
||||
filename = '%s_%s.db' % (hash, epoch)
|
||||
filename = '%s_%s.db' % (hash_, epoch)
|
||||
else:
|
||||
filename = hash_ + '.db'
|
||||
db_file = os.path.join(datadir, filename)
|
||||
@ -4914,6 +4914,43 @@ class TestSharder(BaseTestSharder):
|
||||
with annotate_failure(state):
|
||||
check_all_shard_ranges_sent(state)
|
||||
|
||||
def test_audit_root_container_reset_epoch(self):
    """Root audit should warn when own_shard_range's epoch is reset.

    First verifies that a healthy sharded root (own_shard_range in
    SHARDED state with its epoch set) passes the audit silently, then
    clears the epoch and checks that the audit emits the expected
    warning referencing the broker's db_epoch.
    """
    epoch = next(self.ts_iter)
    broker = self._make_broker(epoch=epoch.normal)
    # Populate the root with a complete, contiguous set of ACTIVE
    # shard ranges covering the whole namespace.
    bounds = (('', 'j'), ('j', 'k'), ('k', 's'), ('s', 'y'), ('y', ''))
    broker.merge_shard_ranges(self._make_shard_ranges(
        bounds, ShardRange.ACTIVE, timestamp=next(self.ts_iter)))

    # Mark the root's own range as SHARDED with a matching epoch.
    own_sr = broker.get_own_shard_range()
    own_sr.update_state(ShardRange.SHARDED, next(self.ts_iter))
    own_sr.epoch = epoch
    broker.merge_shard_ranges(own_sr)

    # A healthy root audits cleanly: no warnings, no errors, success
    # stats recorded, and the shard-container audit path untouched.
    with self._mock_sharder() as sharder:
        with mock.patch.object(
                sharder, '_audit_shard_container') as mocked:
            sharder._audit_container(broker)
    self.assertFalse(sharder.logger.get_lines_for_level('warning'))
    self.assertFalse(sharder.logger.get_lines_for_level('error'))
    self._assert_stats({'attempted': 1, 'success': 1, 'failure': 0},
                       sharder, 'audit_root')
    mocked.assert_not_called()

    # test for a reset epoch
    own_sr = broker.get_own_shard_range()
    own_sr.epoch = None
    own_sr.state_timestamp = next(self.ts_iter)
    broker.merge_shard_ranges(own_sr)
    with self._mock_sharder() as sharder:
        with mock.patch.object(
                sharder, '_audit_shard_container') as mocked:
            sharder._audit_container(broker)
    warning_lines = sharder.logger.get_lines_for_level('warning')

    self.assertIn("own_shard_range reset to None should be %s"
                  % broker.db_epoch, warning_lines[0])
|
||||
|
||||
def test_audit_root_container(self):
|
||||
broker = self._make_broker()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user