sharder: stall cleaving at shard range gaps

Previously the sharder cleaving process would skip over gaps in shard
ranges. Gaps are not normally expected, but could occur if, for
example, multiple inconsistent decisions are made to configure shards
for shrinking, resulting in a shrinking shard having insufficient
acceptor shards to cover its namespace. In these circumstances the
shrinking shard's cleaving process should stall when it encounters a
gap in the acceptors. This is achieved by always checking that the
lower bound of the next shard range to cleave is less than or equal to
the current cleaving cursor. Cleaving will resume when a suitable
acceptor becomes available to cover the namespace gap.

Change-Id: I1046a5cf809d2a905ede5e1f285939c91843074d
This commit is contained in:
Alistair Coles 2021-02-03 21:38:34 +00:00
parent 29418998b7
commit ed6586c460
2 changed files with 11 additions and 5 deletions

View File

@@ -1706,6 +1706,8 @@ class ContainerSharder(ContainerReplicator):
quote(broker.path))
else:
cleaving_context.start()
own_shard_range = broker.get_own_shard_range()
cleaving_context.cursor = own_shard_range.lower_str
cleaving_context.ranges_todo = len(ranges_todo)
self.logger.debug('Starting to cleave (%s todo): %s',
cleaving_context.ranges_todo, quote(broker.path))
@@ -1722,6 +1724,11 @@ class ContainerSharder(ContainerReplicator):
if len(ranges_done) == self.cleave_batch_size:
break
if shard_range.lower > cleaving_context.cursor:
self.logger.info('Stopped cleave at gap: %r - %r' %
(cleaving_context.cursor, shard_range.lower))
break
if shard_range.state not in (ShardRange.CREATED,
ShardRange.CLEAVED,
ShardRange.ACTIVE):

View File

@@ -1878,14 +1878,13 @@ class TestSharder(BaseTestSharder):
do_test(ShardRange.CLEAVED, (('d', 'k'), ('k', 't')), expect_delete)
# shrinking to incomplete acceptors, gap at start and end of namespace
do_test(ShardRange.CREATED, (('k', 't'),), expect_delete,
exp_progress_bounds=(('k', 't'),))
exp_progress_bounds=())
# shrinking to incomplete acceptors, gap at start of namespace
expect_delete = True
do_test(ShardRange.CLEAVED, (('k', 't'), ('t', '')), expect_delete,
exp_progress_bounds=(('k', 't'), ('t', '')))
# shrinking to incomplete acceptors, gap in middle
exp_progress_bounds=())
# shrinking to incomplete acceptors, gap in middle - some progress
do_test(ShardRange.CLEAVED, (('d', 'k'), ('t', '')), expect_delete,
exp_progress_bounds=(('d', 'k'), ('t', '')))
exp_progress_bounds=(('d', 'k'),))
def test_cleave_repeated(self):
# verify that if new objects are merged into retiring db after cleaving