Merge "Fix shrinking making acceptors prematurely active"

Zuul 2021-04-30 06:12:38 +00:00 committed by Gerrit Code Review
commit cf6095c906
4 changed files with 257 additions and 112 deletions

swift/container/sharder.py

@@ -1597,12 +1597,17 @@ class ContainerSharder(ContainerReplicator):
quote(broker.path), shard_range)
replication_quorum = self.existing_shard_replication_quorum
if own_shard_range.state in (ShardRange.SHRINKING, ShardRange.SHRUNK):
if shard_range.includes(own_shard_range):
# When shrinking, include deleted own (donor) shard range in
# the replicated db so that when acceptor next updates root it
# will atomically update its namespace *and* delete the donor.
# Don't do this when sharding a shard because the donor
# namespace should not be deleted until all shards are cleaved.
# When shrinking to a single acceptor that completely encloses
# this shard's namespace, include deleted own (donor) shard
# range in the replicated db so that when acceptor next updates
# root it will atomically update its namespace *and* delete the
# donor. This reduces the chance of a temporary listing gap if
# this shard fails to update the root with its SHRUNK/deleted
# state. Don't do this when sharding a shard or shrinking to
# multiple acceptors because in those cases the donor namespace
# should not be deleted until *all* shards are cleaved.
if own_shard_range.update_state(ShardRange.SHRUNK):
own_shard_range.set_deleted()
broker.merge_shard_ranges(own_shard_range)
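A minimal standalone sketch of the guard this hunk adds, using the ShardRange API shown above; the helper name is illustrative, not part of Swift:

from swift.common.utils import ShardRange

# Hedged sketch, not the committed code: forward the deleted own
# (donor) shard range only when shrinking into a single acceptor whose
# namespace completely encloses the donor's.
def should_forward_deleted_donor(own_shard_range, acceptor_range):
    shrinking = own_shard_range.state in (ShardRange.SHRINKING,
                                          ShardRange.SHRUNK)
    return shrinking and acceptor_range.includes(own_shard_range)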
@@ -1615,6 +1620,8 @@ class ContainerSharder(ContainerReplicator):
info = shard_broker.get_info()
shard_range.update_meta(
info['object_count'], info['bytes_used'])
# Update state to CLEAVED; only do this when sharding, not when
# shrinking
shard_range.update_state(ShardRange.CLEAVED)
shard_broker.merge_shard_ranges(shard_range)
replication_quorum = self.shard_replication_quorum
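For contrast with the shrinking path, a short sketch of how the acceptor's state is treated in the two cases; the boolean parameter stands in for the donor-state check the real code performs:

from swift.common.utils import ShardRange

# Hedged sketch: when sharding, the acceptor is marked CLEAVED so
# cleaving progress is tracked; when shrinking, its existing state is
# left alone so it cannot become prematurely active before objects
# have been cleaved into it.
def next_acceptor_state(acceptor_range, donor_is_shrinking):
    if donor_is_shrinking:
        return acceptor_range.state
    return ShardRange.CLEAVED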
@@ -1747,18 +1754,18 @@ class ContainerSharder(ContainerReplicator):
# Move all CLEAVED shards to ACTIVE state and if a shard then
# delete own shard range; these changes will be simultaneously
# reported in the next update to the root container.
own_shard_range = broker.get_own_shard_range()
own_shard_range.update_meta(0, 0)
if own_shard_range.state in (ShardRange.SHRINKING,
ShardRange.SHRUNK):
own_shard_range.update_state(ShardRange.SHRUNK)
modified_shard_ranges = []
else:
own_shard_range.update_state(ShardRange.SHARDED)
modified_shard_ranges = broker.get_shard_ranges(
states=ShardRange.CLEAVED)
for sr in modified_shard_ranges:
sr.update_state(ShardRange.ACTIVE)
own_shard_range = broker.get_own_shard_range()
if own_shard_range.state in (ShardRange.SHRINKING,
ShardRange.SHRUNK):
next_state = ShardRange.SHRUNK
else:
next_state = ShardRange.SHARDED
own_shard_range.update_state(next_state)
own_shard_range.update_meta(0, 0)
if (not broker.is_root_container() and not
own_shard_range.deleted):
own_shard_range = own_shard_range.copy(
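A condensed sketch of the completion step this hunk reorders (the helper below is a simplification, not the real _complete_sharding); the key point is that CLEAVED ranges are promoted to ACTIVE only when sharding, never when shrinking:

from swift.common.utils import ShardRange

# Hedged sketch: a shrinking donor moves to SHRUNK with zeroed stats
# and promotes nothing; a sharding container promotes its CLEAVED
# shard ranges to ACTIVE and moves its own range to SHARDED.
def complete_donor(broker):
    own_shard_range = broker.get_own_shard_range()
    if own_shard_range.state in (ShardRange.SHRINKING, ShardRange.SHRUNK):
        modified_shard_ranges = []
        next_state = ShardRange.SHRUNK
    else:
        modified_shard_ranges = broker.get_shard_ranges(
            states=ShardRange.CLEAVED)
        next_state = ShardRange.SHARDED
    for sr in modified_shard_ranges:
        sr.update_state(ShardRange.ACTIVE)
    own_shard_range.update_state(next_state)
    own_shard_range.update_meta(0, 0)
    broker.merge_shard_ranges(modified_shard_ranges + [own_shard_range])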

test/probe/test_sharder.py

@@ -2858,7 +2858,7 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
def test_manage_shard_ranges_repair_root(self):
# provoke overlaps in root container and repair
obj_names = self._make_object_names(8)
obj_names = self._make_object_names(16)
self.put_objects(obj_names)
client.post_container(self.url, self.admin_token, self.container_name,
@@ -2872,7 +2872,7 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
self.assert_subprocess_success([
'swift-manage-shard-ranges',
self.get_db_file(self.brain.part, self.brain.nodes[0]),
'find_and_replace', '2', '--enable'])
'find_and_replace', '4', '--enable'])
shard_ranges_0 = self.assert_container_state(self.brain.nodes[0],
'unsharded', 4)
@@ -2882,25 +2882,26 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
self.assert_subprocess_success([
'swift-manage-shard-ranges',
self.get_db_file(self.brain.part, self.brain.nodes[1]),
'find_and_replace', '3', '--enable'])
'find_and_replace', '7', '--enable'])
shard_ranges_1 = self.assert_container_state(self.brain.nodes[1],
'unsharded', 3)
# Run sharder in specific order so that the replica with the older
# epoch_0 starts sharding first - this will prove problematic later!
# On first pass the first replica passes audit, creates shards and then
# syncs shard ranges with the other replicas. It proceeds to cleave
# shard 0.0, but after 0.0 cleaving stalls because it will now have
# shard range 1.0 in 'found' state from the other replica that it
# cannot yet cleave.
# syncs shard ranges with the other replicas, so it has a mix of 0.*
# shard ranges in CLEAVED state and 1.* ranges in FOUND state. It
# proceeds to cleave shard 0.0, but after 0.0 cleaving stalls because
# next in iteration is shard range 1.0 in FOUND state from the other
# replica that it cannot yet cleave.
self.sharders_once(number=self.brain.node_numbers[0],
additional_args='--partitions=%s' % self.brain.part)
# On first pass the second replica passes audit (it has its own found
# ranges and the first replicas created shard ranges but none in the
# ranges and the first replica's created shard ranges but none in the
# same state overlap), creates its shards and then syncs shard ranges
# with the other replicas. All of the 7 shard ranges on this replica
# are now in created state so it proceeds to cleave the first two shard
# are now in CREATED state so it proceeds to cleave the first two shard
# ranges, 0.1 and 1.0.
self.sharders_once(number=self.brain.node_numbers[1],
additional_args='--partitions=%s' % self.brain.part)
@@ -2922,26 +2923,53 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
# possibly cleaved during first pass before the sharding got stalled
shard_ranges = self.assert_container_state(self.brain.nodes[0],
'sharding', 7)
for sr in shard_ranges:
self.assertIn(sr.state, (ShardRange.CREATED, ShardRange.CLEAVED))
self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 5,
[sr.state for sr in shard_ranges])
shard_ranges = self.assert_container_state(self.brain.nodes[1],
'sharding', 7)
for sr in shard_ranges:
self.assertIn(sr.state, (ShardRange.CREATED, ShardRange.CLEAVED))
self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 5,
[sr.state for sr in shard_ranges])
# But hey, at least listings still work! They're just going to get
# horribly out of date as more objects are added
self.assert_container_listing(obj_names)
# 'swift-manage-shard-ranges repair' will choose the second set of 3
# shard ranges (1.*) with newer timestamp over the first set of 4
# (0.*), and shrink shard ranges 0.*.
# shard ranges (1.*) over the first set of 4 (0.*) because that's the
# path with most cleaving progress, and so shrink shard ranges 0.*.
db_file = self.get_db_file(self.brain.part, self.brain.nodes[0])
self.assert_subprocess_success(
['swift-manage-shard-ranges', db_file, 'repair', '--yes'])
# make sure all root replicas now sync their shard ranges
self.replicators.once()
# Run sharder on the shrinking shards. This should not change the state
# of any of the acceptors, particularly the ones that have yet to have
# objects cleaved from the root, because we don't want the as yet
# uncleaved acceptors becoming prematurely active and creating 'holes'
# in listings. The shrinking shard ranges should, however, get deleted
# in the root container table.
self.run_sharders(shard_ranges_0)
shard_ranges = self.assert_container_state(self.brain.nodes[1],
'sharding', 3)
self.assertEqual([ShardRange.CLEAVED] * 1 + [ShardRange.CREATED] * 2,
[sr.state for sr in shard_ranges])
self.assert_container_listing(obj_names)
# check the unwanted shards did shrink away...
for shard_range in shard_ranges_0:
with annotate_failure(shard_range):
found_for_shard = self.categorize_container_dir_content(
shard_range.account, shard_range.container)
self.assertLengthEqual(found_for_shard['shard_dbs'], 3)
actual = []
for shard_db in found_for_shard['shard_dbs']:
broker = ContainerBroker(shard_db)
own_sr = broker.get_own_shard_range()
actual.append(
(broker.get_db_state(), own_sr.state, own_sr.deleted))
self.assertEqual([(SHARDED, ShardRange.SHRUNK, True)] * 3,
actual)
# At this point one of the first two replicas may have done some useful
# cleaving of 1.* shards, the other may have only cleaved 0.* shards,
# and the third replica may have cleaved no shards. We therefore need
@@ -2953,7 +2981,11 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
# now we expect all replicas to have just the three 1.* shards, with
# the 0.* shards all deleted
brokers = {}
orig_shard_ranges = sorted(shard_ranges_0 + shard_ranges_1,
exp_shard_ranges = sorted(
[sr.copy(state=ShardRange.SHRUNK, deleted=True)
for sr in shard_ranges_0] +
[sr.copy(state=ShardRange.ACTIVE)
for sr in shard_ranges_1],
key=ShardRange.sort_key)
for node in (0, 1, 2):
with annotate_failure('node %s' % node):
@@ -2963,12 +2995,14 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
shard_ranges = broker.get_shard_ranges()
self.assertEqual(shard_ranges_1, shard_ranges)
shard_ranges = broker.get_shard_ranges(include_deleted=True)
self.assertLengthEqual(shard_ranges, len(orig_shard_ranges))
self.assertEqual(orig_shard_ranges, shard_ranges)
self.assertLengthEqual(shard_ranges, len(exp_shard_ranges))
self.maxDiff = None
self.assertEqual(exp_shard_ranges, shard_ranges)
self.assertEqual(ShardRange.SHARDED,
broker._own_shard_range().state)
# Sadly, the first replica to start sharding us still reporting its db
# state to be 'unsharded' because, although it has sharded, it's shard
# Sadly, the first replica to start sharding is still reporting its db
# state to be 'unsharded' because, although it has sharded, its shard
# db epoch (epoch_0) does not match its own shard range epoch
# (epoch_1), and that is because the second replica (with epoch_1)
# updated the own shard range and replicated it to all other replicas.
@@ -2985,21 +3019,6 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
# not return shard ranges for listings, but has no objects, so it's
# luck of the draw whether we get a listing or not at this point :(
# check the unwanted shards did shrink away...
for shard_range in shard_ranges_0:
with annotate_failure(shard_range):
found_for_shard = self.categorize_container_dir_content(
shard_range.account, shard_range.container)
self.assertLengthEqual(found_for_shard['shard_dbs'], 3)
actual = []
for shard_db in found_for_shard['shard_dbs']:
broker = ContainerBroker(shard_db)
own_sr = broker.get_own_shard_range()
actual.append(
(broker.get_db_state(), own_sr.state, own_sr.deleted))
self.assertEqual([(SHARDED, ShardRange.SHRUNK, True)] * 3,
actual)
# Run the sharders again: the first replica that is still 'unsharded'
# because of the older epoch_0 in its db filename will now start to
# shard again with a newer epoch_1 db, and will start to re-cleave the
@@ -3013,8 +3032,8 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
shard_ranges = broker.get_shard_ranges()
self.assertEqual(shard_ranges_1, shard_ranges)
shard_ranges = broker.get_shard_ranges(include_deleted=True)
self.assertLengthEqual(shard_ranges, len(orig_shard_ranges))
self.assertEqual(orig_shard_ranges, shard_ranges)
self.assertLengthEqual(shard_ranges, len(exp_shard_ranges))
self.assertEqual(exp_shard_ranges, shard_ranges)
self.assertEqual(ShardRange.SHARDED,
broker._own_shard_range().state)
self.assertEqual(epoch_1, broker.db_epoch)
@@ -3057,8 +3076,8 @@ class TestManagedContainerSharding(BaseTestContainerSharding):
shard_ranges = broker.get_shard_ranges()
self.assertEqual(shard_ranges_1, shard_ranges)
shard_ranges = broker.get_shard_ranges(include_deleted=True)
self.assertLengthEqual(shard_ranges, len(orig_shard_ranges))
self.assertEqual(orig_shard_ranges, shard_ranges)
self.assertLengthEqual(shard_ranges, len(exp_shard_ranges))
self.assertEqual(exp_shard_ranges, shard_ranges)
self.assertEqual(ShardRange.SHARDED,
broker._own_shard_range().state)
self.assertEqual(epoch_1, broker.db_epoch)

test/unit/__init__.py

@@ -1009,6 +1009,13 @@ def mock_timestamp_now(now=None):
yield now
@contextmanager
def mock_timestamp_now_with_iter(ts_iter):
with mocklib.patch('swift.common.utils.Timestamp.now',
side_effect=ts_iter):
yield
class Timeout(object):
def __init__(self, seconds):
self.seconds = seconds
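A brief usage sketch of the new mock_timestamp_now_with_iter helper (the example function is ours, not part of the change); inside the context every Timestamp.now() call consumes the next value from the supplied iterator, giving deterministic, strictly ordered 'now' values:

from swift.common.utils import Timestamp
from test.unit import make_timestamp_iter, mock_timestamp_now_with_iter

def example():
    # Hedged sketch: illustrates the helper, not taken from the tests.
    ts_iter = make_timestamp_iter()
    with mock_timestamp_now_with_iter(ts_iter):
        first = Timestamp.now()   # next(ts_iter)
        second = Timestamp.now()  # the value after that
    assert second > first
    return first, second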

test/unit/container/test_sharder.py

@@ -49,7 +49,8 @@ from test import annotate_failure
from test.debug_logger import debug_logger
from test.unit import FakeRing, make_timestamp_iter, unlink_files, \
mocked_http_conn, mock_timestamp_now, attach_fake_replication_rpc
mocked_http_conn, mock_timestamp_now, mock_timestamp_now_with_iter, \
attach_fake_replication_rpc
class BaseTestSharder(unittest.TestCase):
@@ -903,9 +904,12 @@ class TestSharder(BaseTestSharder):
10, '', '', '', '', include_deleted=None, all_policies=True,
transform_func=lambda record: record)]
def _check_objects(self, expected_objs, shard_db):
def _check_objects(self, expected_objs, shard_dbs):
shard_dbs = shard_dbs if isinstance(shard_dbs, list) else [shard_dbs]
shard_objs = []
for shard_db in shard_dbs:
shard_broker = ContainerBroker(shard_db)
shard_objs = self._get_raw_object_records(shard_broker)
shard_objs.extend(self._get_raw_object_records(shard_broker))
expected_objs = [list(obj) for obj in expected_objs]
self.assertEqual(expected_objs, shard_objs)
@@ -1718,62 +1722,170 @@ class TestSharder(BaseTestSharder):
self.assertFalse(os.path.exists(misplaced_dbs[1]))
def test_cleave_shard_shrinking(self):
broker = self._make_broker(account='.shards_a', container='shard_c')
unique = [0]
def do_test(acceptor_state, acceptor_bounds, expect_delete,
exp_progress_bounds=None):
# 'unique' ensures fresh dbs on each test iteration
unique[0] += 1
broker = self._make_broker(account='.shards_a',
container='donor_%s' % unique[0])
own_shard_range = ShardRange(
broker.path, next(self.ts_iter), 'here', 'where',
broker.path, next(self.ts_iter), 'h', 'w',
state=ShardRange.SHRINKING, epoch=next(self.ts_iter))
broker.merge_shard_ranges([own_shard_range])
broker.set_sharding_sysmeta('Root', 'a/c')
self.assertFalse(broker.is_root_container()) # sanity check
objects = [
('there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0),
('where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0,
0),
('i', self.ts_encoded(), 3, 'text/plain', 'etag_t', 0, 0),
('m', self.ts_encoded(), 33, 'text/plain', 'etag_m', 0, 0),
('w', self.ts_encoded(), 100, 'text/plain', 'etag_w', 0, 0),
]
for obj in objects:
broker.put_object(*obj)
acceptor_epoch = next(self.ts_iter)
acceptor = ShardRange('.shards_a/acceptor', Timestamp.now(),
'here', 'yonder', '1000', '11111',
state=ShardRange.ACTIVE, epoch=acceptor_epoch)
db_hash = hash_path(acceptor.account, acceptor.container)
# NB expected cleave db includes acceptor epoch
expected_shard_db = os.path.join(
self.tempdir, 'sda', 'containers', '0', db_hash[-3:], db_hash,
'%s_%s.db' % (db_hash, acceptor_epoch.internal))
acceptors = [
ShardRange('.shards_a/acceptor_%s_%s' % (unique[0], bounds[1]),
Timestamp.now(), bounds[0], bounds[1],
'1000', '11111',
state=acceptor_state, epoch=acceptor_epoch)
for bounds in acceptor_bounds]
# by default expect cleaving to progress through all acceptors
if exp_progress_bounds is None:
exp_progress_acceptors = acceptors
else:
exp_progress_acceptors = [
ShardRange(
'.shards_a/acceptor_%s_%s' % (unique[0], bounds[1]),
Timestamp.now(), bounds[0], bounds[1], '1000', '11111',
state=acceptor_state, epoch=acceptor_epoch)
for bounds in exp_progress_bounds]
expected_acceptor_dbs = []
for acceptor in exp_progress_acceptors:
db_hash = hash_path(acceptor.account,
acceptor.container)
# NB expected cleaved db name includes acceptor epoch
db_name = '%s_%s.db' % (db_hash, acceptor_epoch.internal)
expected_acceptor_dbs.append(
os.path.join(self.tempdir, 'sda', 'containers', '0',
db_hash[-3:], db_hash, db_name))
broker.merge_shard_ranges([acceptor])
broker.merge_shard_ranges(acceptors)
broker.set_sharding_state()
# run cleave
with mock_timestamp_now_with_iter(self.ts_iter):
with self._mock_sharder() as sharder:
self.assertTrue(sharder._cleave(broker))
sharder.cleave_batch_size = 3
self.assertEqual(expect_delete, sharder._cleave(broker))
# check the cleave context and source broker
context = CleavingContext.load(broker)
self.assertTrue(context.misplaced_done)
self.assertTrue(context.cleaving_done)
self.assertEqual(acceptor.upper_str, context.cursor)
self.assertEqual(2, context.cleave_to_row)
self.assertEqual(2, context.max_row)
self.assertEqual(expect_delete, context.cleaving_done)
if exp_progress_acceptors:
expected_cursor = exp_progress_acceptors[-1].upper_str
else:
expected_cursor = own_shard_range.lower_str
self.assertEqual(expected_cursor, context.cursor)
self.assertEqual(3, context.cleave_to_row)
self.assertEqual(3, context.max_row)
self.assertEqual(SHARDING, broker.get_db_state())
own_sr = broker.get_own_shard_range()
if expect_delete and len(acceptor_bounds) == 1:
self.assertTrue(own_sr.deleted)
self.assertEqual(ShardRange.SHRUNK, own_sr.state)
else:
self.assertFalse(own_sr.deleted)
self.assertEqual(ShardRange.SHRINKING, own_sr.state)
# check the acceptor dbs
sharder._replicate_object.assert_has_calls(
[mock.call(0, expected_shard_db, 0)])
shard_broker = ContainerBroker(expected_shard_db)
# NB when cleaving a shard container to a larger acceptor namespace
# then expect the shard broker's own shard range to reflect that of the
# acceptor shard range rather than being set to CLEAVED.
self.assertEqual(
ShardRange.ACTIVE, shard_broker.get_own_shard_range().state)
[mock.call(0, acceptor_db, 0)
for acceptor_db in expected_acceptor_dbs])
for acceptor_db in expected_acceptor_dbs:
self.assertTrue(os.path.exists(acceptor_db))
# NB when *shrinking* a shard container then expect the
# acceptor broker's own shard range state to remain in the
# original state of the acceptor shard range rather than being
# set to CLEAVED as it would when *sharding*.
acceptor_broker = ContainerBroker(acceptor_db)
self.assertEqual(acceptor_state,
acceptor_broker.get_own_shard_range().state)
acceptor_ranges = acceptor_broker.get_shard_ranges(
include_deleted=True)
if expect_delete and len(acceptor_bounds) == 1:
# special case when deleted shrinking shard range is
# forwarded to single enclosing acceptor
self.assertEqual([own_sr], acceptor_ranges)
self.assertTrue(acceptor_ranges[0].deleted)
self.assertEqual(ShardRange.SHRUNK,
acceptor_ranges[0].state)
else:
self.assertEqual([], acceptor_ranges)
expected_objects = [
obj for obj in objects
if any(acceptor.lower < obj[0] <= acceptor.upper
for acceptor in exp_progress_acceptors)
]
self._check_objects(expected_objects, expected_acceptor_dbs)
# check that *shrinking* shard's copies of acceptor ranges are not
# updated as they would be if *sharding*
updated_shard_ranges = broker.get_shard_ranges()
self.assertEqual([dict(sr) for sr in acceptors],
[dict(sr) for sr in updated_shard_ranges])
# check that *shrinking* shard's copies of acceptor ranges are not
# updated when completing sharding as they would be if *sharding*
with mock_timestamp_now_with_iter(self.ts_iter):
sharder._complete_sharding(broker)
updated_shard_ranges = broker.get_shard_ranges()
self.assertEqual(1, len(updated_shard_ranges))
self.assertEqual(dict(acceptor), dict(updated_shard_ranges[0]))
self.assertEqual([dict(sr) for sr in acceptors],
[dict(sr) for sr in updated_shard_ranges])
own_sr = broker.get_own_shard_range()
self.assertEqual(expect_delete, own_sr.deleted)
if expect_delete:
self.assertEqual(ShardRange.SHRUNK, own_sr.state)
else:
self.assertEqual(ShardRange.SHRINKING, own_sr.state)
# shard range should have unmodified acceptor, bytes used and
# meta_timestamp
self._check_objects(objects, expected_shard_db)
# note: shrinking shard bounds are (h, w)
# shrinking to a single acceptor with enclosing namespace
expect_delete = True
do_test(ShardRange.CREATED, (('h', ''),), expect_delete)
do_test(ShardRange.CLEAVED, (('h', ''),), expect_delete)
do_test(ShardRange.ACTIVE, (('h', ''),), expect_delete)
# shrinking to multiple acceptors that enclose namespace
do_test(ShardRange.CREATED, (('d', 'k'), ('k', '')), expect_delete)
do_test(ShardRange.CLEAVED, (('d', 'k'), ('k', '')), expect_delete)
do_test(ShardRange.ACTIVE, (('d', 'k'), ('k', '')), expect_delete)
do_test(ShardRange.CLEAVED, (('d', 'k'), ('k', 't'), ('t', '')),
expect_delete)
do_test(ShardRange.CREATED, (('d', 'k'), ('k', 't'), ('t', '')),
expect_delete)
do_test(ShardRange.ACTIVE, (('d', 'k'), ('k', 't'), ('t', '')),
expect_delete)
# shrinking to incomplete acceptors, gap at end of namespace
expect_delete = False
do_test(ShardRange.CREATED, (('d', 'k'),), expect_delete)
do_test(ShardRange.CLEAVED, (('d', 'k'), ('k', 't')), expect_delete)
# shrinking to incomplete acceptors, gap at start and end of namespace
do_test(ShardRange.CREATED, (('k', 't'),), expect_delete,
exp_progress_bounds=(('k', 't'),))
# shrinking to incomplete acceptors, gap at start of namespace
expect_delete = True
do_test(ShardRange.CLEAVED, (('k', 't'), ('t', '')), expect_delete,
exp_progress_bounds=(('k', 't'), ('t', '')))
# shrinking to incomplete acceptors, gap in middle
do_test(ShardRange.CLEAVED, (('d', 'k'), ('t', '')), expect_delete,
exp_progress_bounds=(('d', 'k'), ('t', '')))
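A tiny standalone check of the enclosure property these cases exercise (names are illustrative); the shrinking donor's bounds are ('h', 'w'), so an acceptor only counts as enclosing if its namespace spans that whole range:

from swift.common.utils import ShardRange, Timestamp

# Hedged sketch: ShardRange.includes() is the test the sharder uses to
# decide whether the deleted donor range may be forwarded (see the
# first hunk of this change).
donor = ShardRange('.shards_a/donor', Timestamp.now(), 'h', 'w')
whole = ShardRange('.shards_a/acc_whole', Timestamp.now(), 'h', '')
partial = ShardRange('.shards_a/acc_partial', Timestamp.now(), 'd', 'k')
print(whole.includes(donor))    # True: a single enclosing acceptor
print(partial.includes(donor))  # False: gap at the end of the namespace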
def test_cleave_repeated(self):
# verify that if new objects are merged into retiring db after cleaving