Merge "Let admins add a region without melting their cluster"
commit 4758eaac3f
@@ -326,8 +326,7 @@ class RingBuilder(object):
         that before). Because of this, it keeps rebalancing until the device
         skew (number of partitions a device wants compared to what it has) gets
         below 1% or doesn't change by more than 1% (only happens with ring that
-        can't be balanced no matter what -- like with 3 zones of differing
-        weights with replicas set to 3).
+        can't be balanced no matter what).

         :returns: (number_of_partitions_altered, resulting_balance)
         """
@@ -517,8 +516,18 @@ class RingBuilder(object):
                 # indicate its strong desire to give up everything it has.
                 dev['parts_wanted'] = -self.parts * self.replicas
             else:
-                dev['parts_wanted'] = \
-                    int(weight_of_one_part * dev['weight']) - dev['parts']
+                dev['parts_wanted'] = (
+                    # Round up here so that every partition ultimately ends up
+                    # with a placement.
+                    #
+                    # Imagine 5 partitions to be placed on 4 devices. If we
+                    # didn't use math.ceil() here, each device would have a
+                    # parts_wanted of 1, so 4 partitions would be placed but
+                    # the last would not, probably resulting in a crash. This
+                    # way, some devices end up with leftover parts_wanted, but
+                    # at least every partition ends up somewhere.
+                    int(math.ceil(weight_of_one_part * dev['weight'])) -
+                    dev['parts'])

     def _adjust_replica2part2dev_size(self):
         """
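To make the comment's 5-partitions-on-4-devices example concrete, here is a tiny standalone sketch (mine, not part of the patch; it assumes one replica and equal device weights, and the names are only illustrative):

    import math

    # 5 partitions spread over 4 equal-weight devices, one replica, so each
    # device's fair share is 5 / 4.0 = 1.25 partitions.
    parts, ndevs = 5, 4
    weight_of_one_part = parts / float(ndevs)   # fair share per unit of weight

    # int() truncates: every device wants 1 partition, 4 slots in total, and
    # the fifth partition has nowhere to go.
    print(int(weight_of_one_part * 1) * ndevs)              # 4

    # math.ceil() rounds up: every device wants 2, 8 slots in total, so all 5
    # partitions find a home (some slots simply go unused).
    print(int(math.ceil(weight_of_one_part * 1)) * ndevs)   # 8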
@@ -754,9 +763,24 @@ class RingBuilder(object):
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
         """
+        parts_available_in_tier = defaultdict(int)
         for dev in self._iter_devs():
             dev['sort_key'] = self._sort_key_for(dev)
-            dev['tiers'] = tiers_for_dev(dev)
+            tiers = tiers_for_dev(dev)
+            dev['tiers'] = tiers
+            for tier in tiers:
+                # Note: this represents how many partitions may be assigned to
+                # a given tier (region/zone/server/disk). It does not take
+                # into account how many partitions a given tier wants to shed.
+                #
+                # If we did not do this, we could have a zone where, at some
+                # point during assignment, number-of-parts-to-gain equals
+                # number-of-parts-to-shed. At that point, no further placement
+                # into that zone would occur since its parts_available_in_tier
+                # would be 0. This would happen any time a zone had any device
+                # with partitions to shed, which is any time a device is being
+                # removed, which is a pretty frequent operation.
+                parts_available_in_tier[tier] += max(dev['parts_wanted'], 0)

         available_devs = \
             sorted((d for d in self._iter_devs() if d['weight']),
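Roughly what that new bookkeeping computes, sketched outside the builder (my hand-written tier tuples stand in for tiers_for_dev(), which is not reproduced here):

    from collections import defaultdict

    # Two devices in the same region; the second is being drained, so its
    # parts_wanted is negative.
    devs = [
        {'parts_wanted': 10, 'tiers': [(0,), (0, 0), (0, 0, '10.0.0.1:6000')]},
        {'parts_wanted': -3, 'tiers': [(0,), (0, 1), (0, 1, '10.0.0.2:6000')]},
    ]

    parts_available_in_tier = defaultdict(int)
    for dev in devs:
        for tier in dev['tiers']:
            # max(..., 0): a device that wants to shed partitions contributes
            # zero capacity rather than cancelling out a hungry sibling.
            parts_available_in_tier[tier] += max(dev['parts_wanted'], 0)

    print(parts_available_in_tier[(0,)])  # 10, not 7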
@@ -795,23 +819,25 @@ class RingBuilder(object):
             # Gather up what other tiers (regions, zones, ip/ports, and
             # devices) the replicas not-to-be-moved are in for this part.
             other_replicas = defaultdict(int)
-            unique_tiers_by_tier_len = defaultdict(set)
+            occupied_tiers_by_tier_len = defaultdict(set)
             for replica in self._replicas_for_part(part):
                 if replica not in replace_replicas:
                     dev = self.devs[self._replica2part2dev[replica][part]]
                     for tier in dev['tiers']:
                         other_replicas[tier] += 1
-                        unique_tiers_by_tier_len[len(tier)].add(tier)
+                        occupied_tiers_by_tier_len[len(tier)].add(tier)
+
             for replica in replace_replicas:
                 # Find a new home for this replica
                 tier = ()
                 depth = 1
                 while depth <= max_tier_depth:
                     # Order the tiers by how many replicas of this
                     # partition they already have. Then, of the ones
-                    # with the smallest number of replicas, pick the
-                    # tier with the hungriest drive and then continue
-                    # searching in that subtree.
+                    # with the smallest number of replicas and that have
+                    # room to accept more partitions, pick the tier with
+                    # the hungriest drive and then continue searching in
+                    # that subtree.
                     #
                     # There are other strategies we could use here,
                     # such as hungriest-tier (i.e. biggest
@@ -819,10 +845,11 @@ class RingBuilder(object):
                     # However, hungriest-drive is what was used here
                     # before, and it worked pretty well in practice.
                     #
-                    # Note that this allocator will balance things as
-                    # evenly as possible at each level of the device
-                    # layout. If your layout is extremely unbalanced,
-                    # this may produce poor results.
+                    # Note that this allocator prioritizes even device
+                    # filling over dispersion, so if your layout is
+                    # extremely unbalanced, you may not get the replica
+                    # dispersion that you expect, and your durability
+                    # may be lessened.
                     #
                     # This used to be a cute, recursive function, but it's been
                     # unrolled for performance.
@@ -834,18 +861,28 @@ class RingBuilder(object):
                     # short-circuit the search while still ensuring we get the
                     # right tier.
                     candidates_with_replicas = \
-                        unique_tiers_by_tier_len[len(tier) + 1]
-                    # Find a tier with the minimal replica count and the
-                    # hungriest drive among all the tiers with the minimal
-                    # replica count.
-                    if len(tier2children[tier]) > \
+                        occupied_tiers_by_tier_len[len(tier) + 1]
+
+                    # Among the tiers with room for more partitions,
+                    # find one with the smallest possible number of
+                    # replicas already in it, breaking ties by which one
+                    # has the hungriest drive.
+                    candidates_with_room = [
+                        t for t in tier2children[tier]
+                        if parts_available_in_tier[t] > 0]
+
+                    if len(candidates_with_room) > \
                             len(candidates_with_replicas):
-                        # There exists at least one tier with 0 other replicas
-                        tier = max((t for t in tier2children[tier]
+                        # There exists at least one tier with room for
+                        # another partition and 0 other replicas already
+                        # in it, so we can use a faster search. The else
+                        # branch's search would work here, but it's
+                        # significantly slower.
+                        tier = max((t for t in candidates_with_room
                                     if other_replicas[t] == 0),
                                    key=tier2sort_key.__getitem__)
                     else:
-                        tier = max(tier2children[tier],
+                        tier = max(candidates_with_room,
                                    key=lambda t: (-other_replicas[t],
                                                   tier2sort_key[t]))
                     depth += 1
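The core of the new selection rule, pulled out of the builder for illustration (hypothetical helper and variable names; the real code also keeps the faster path shown above for tiers with zero replicas):

    from collections import defaultdict

    def pick_tier(children, parts_available_in_tier, other_replicas,
                  tier2sort_key):
        # Only tiers that can still accept at least one partition qualify.
        candidates_with_room = [t for t in children
                                if parts_available_in_tier[t] > 0]
        # Fewest existing replicas wins; ties go to the hungriest drive.
        return max(candidates_with_room,
                   key=lambda t: (-other_replicas[t], tier2sort_key[t]))

    # Zone (0, 2) is empty and has room, so it beats zone (0, 1), which is
    # hungrier but already holds a replica; zone (0, 3) is full and ignored.
    room = {(0, 1): 5, (0, 2): 3, (0, 3): 0}
    replicas = defaultdict(int, {(0, 1): 1})
    sort_key = {(0, 1): 9, (0, 2): 4, (0, 3): 8}
    print(pick_tier([(0, 1), (0, 2), (0, 3)], room, replicas, sort_key))  # (0, 2)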
@@ -855,8 +892,9 @@ class RingBuilder(object):
                 old_sort_key = dev['sort_key']
                 new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                 for tier in dev['tiers']:
+                    parts_available_in_tier[tier] -= 1
                     other_replicas[tier] += 1
-                    unique_tiers_by_tier_len[len(tier)].add(tier)
+                    occupied_tiers_by_tier_len[len(tier)].add(tier)

                     index = bisect.bisect_left(tier2dev_sort_key[tier],
                                                old_sort_key)
@@ -35,6 +35,19 @@ class TestRingBuilder(unittest.TestCase):
     def tearDown(self):
         rmtree(self.testdir, ignore_errors=1)

+    def _get_population_by_region(self, builder):
+        """
+        Returns a dictionary mapping region to number of partitions in that
+        region.
+        """
+        population_by_region = defaultdict(int)
+        r = builder.get_ring()
+        for part2dev_id in r._replica2part2dev_id:
+            for dev_id in part2dev_id:
+                dev = r.devs[dev_id]
+                population_by_region[dev['region']] += 1
+        return dict(population_by_region.items())
+
     def test_init(self):
         rb = ring.RingBuilder(8, 3, 1)
         self.assertEquals(rb.part_power, 8)
@@ -641,6 +654,92 @@ class TestRingBuilder(unittest.TestCase):

         rb.rebalance()

+    def test_region_fullness_with_balanceable_ring(self):
+        rb = ring.RingBuilder(8, 3, 1)
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
+
+        rb.add_dev({'id': 2, 'region': 1, 'zone': 0, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'})
+        rb.add_dev({'id': 3, 'region': 1, 'zone': 1, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10004, 'device': 'sda1'})
+
+        rb.add_dev({'id': 4, 'region': 2, 'zone': 0, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10005, 'device': 'sda1'})
+        rb.add_dev({'id': 5, 'region': 2, 'zone': 1, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10006, 'device': 'sda1'})
+
+        rb.add_dev({'id': 6, 'region': 3, 'zone': 0, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10007, 'device': 'sda1'})
+        rb.add_dev({'id': 7, 'region': 3, 'zone': 1, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10008, 'device': 'sda1'})
+        rb.rebalance(seed=2)
+
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region,
+                          {0: 192, 1: 192, 2: 192, 3: 192})
+
+    def test_region_fullness_with_unbalanceable_ring(self):
+        rb = ring.RingBuilder(8, 3, 1)
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 2,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 2,
+                    'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
+
+        rb.add_dev({'id': 2, 'region': 1, 'zone': 0, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'})
+        rb.add_dev({'id': 3, 'region': 1, 'zone': 1, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10004, 'device': 'sda1'})
+        rb.rebalance(seed=2)
+
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 512, 1: 256})
+
+    def test_adding_region_slowly_with_unbalanceable_ring(self):
+        rb = ring.RingBuilder(8, 3, 1)
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 2,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 2,
+                    'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
+        rb.rebalance()
+
+        rb.add_dev({'id': 2, 'region': 1, 'zone': 0, 'weight': 0.25,
+                    'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'})
+        rb.add_dev({'id': 3, 'region': 1, 'zone': 1, 'weight': 0.25,
+                    'ip': '127.0.0.1', 'port': 10004, 'device': 'sda1'})
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+
+        # there's not enough room in r1 for every partition to have a replica
+        # in it, so only 86 assignments occur in r1 (that's ~1/9 of the total,
+        # since r1 has 1/9 of the weight).
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 682, 1: 86})
+
+        # and since there's not enough room, subsequent rebalances will not
+        # cause additional assignments to r1
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 682, 1: 86})
+
+        # after you add more weight, more partition assignments move
+        rb.set_dev_weight(2, 0.5)
+        rb.set_dev_weight(3, 0.5)
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 614, 1: 154})
+
+        rb.set_dev_weight(2, 1.0)
+        rb.set_dev_weight(3, 1.0)
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 512, 1: 256})
+
     def test_set_replicas_increase(self):
         rb = ring.RingBuilder(8, 2, 0)
         rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1,
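The expected counts in these tests follow from simple arithmetic; a quick check of my own (not part of the patch):

    part_power, replicas = 8, 3
    total = 2 ** part_power * replicas   # 256 partitions * 3 replicas = 768

    # Balanceable ring: four regions of equal weight split 768 evenly.
    print(total // 4)                    # 192

    # Unbalanceable ring: region 0 has twice region 1's weight, so each
    # partition keeps two replicas in region 0 and one in region 1.
    print(2 ** part_power * 2)           # 512
    print(2 ** part_power)               # 256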
@@ -703,7 +703,10 @@ class TestRing(TestRingBase):
         for region in xrange(1, 5):
             rb.add_dev({'id': next_dev_id,
                         'ip': '1.%d.1.%d' % (region, server), 'port': 1234,
-                        'zone': 1, 'region': region, 'weight': 1.0})
+                        # 108.0 is the weight of all devices created prior to
+                        # this test in region 0; this way all regions have
+                        # equal combined weight
+                        'zone': 1, 'region': region, 'weight': 108.0})
             next_dev_id += 1
         rb.pretend_min_part_hours_passed()
         rb.rebalance(seed=1)