Merge "Accept a trade off of dispersion for balance"

This commit is contained in:
Jenkins 2017-09-26 22:12:11 +00:00 committed by Gerrit Code Review
commit bb9a06ce14
4 changed files with 195 additions and 36 deletions

View File

@@ -525,7 +525,9 @@ class RingBuilder(object):
# we'll gather a few times, or until we achieve the plan
for gather_count in range(MAX_BALANCE_GATHER_COUNT):
self._gather_parts_for_balance(assign_parts, replica_plan)
self._gather_parts_for_balance(assign_parts, replica_plan,
# first attempt, go for dispersion
gather_count == 0)
if not assign_parts:
# most likely min part hours
finish_status = 'Unable to finish'
@@ -1097,6 +1099,12 @@ class RingBuilder(object):
:param start: offset into self.parts to begin search
:param replica_plan: replicanth targets for tiers
"""
tier2children = self._build_tier2children()
parts_wanted_in_tier = defaultdict(int)
for dev in self._iter_devs():
wanted = max(dev['parts_wanted'], 0)
for tier in dev['tiers']:
parts_wanted_in_tier[tier] += wanted
# Last, we gather partitions from devices that are "overweight" because
# they have more partitions than their parts_wanted.
for offset in range(self.parts):
@@ -1128,8 +1136,17 @@ class RingBuilder(object):
replicas_at_tier[tier] <
replica_plan[tier]['max']
for tier in dev['tiers']):
# we're stuck by replica plan
continue
for t in reversed(dev['tiers']):
if replicas_at_tier[t] - 1 < replica_plan[t]['min']:
# we're stuck at tier t
break
if sum(parts_wanted_in_tier[c]
for c in tier2children[t]
if c not in dev['tiers']) <= 0:
# we're stuck by weight
continue
# this is the most overweight device holding a replica
# of this part that can shed it according to the plan
dev['parts_wanted'] += 1
@@ -1141,15 +1158,19 @@ class RingBuilder(object):
self._replica2part2dev[replica][part] = NONE_DEV
for tier in dev['tiers']:
replicas_at_tier[tier] -= 1
parts_wanted_in_tier[tier] -= 1
self._set_part_moved(part)
break
def _gather_parts_for_balance(self, assign_parts, replica_plan):
def _gather_parts_for_balance(self, assign_parts, replica_plan,
disperse_first):
"""
Gather parts that look like they should move for balance reasons.
A simple gather of parts that look dispersible normally works out;
we'll switch strategies if things don't seem to move.
:param disperse_first: boolean; avoid gathering replicas from
overweight devices when they need to stay where they are for dispersion
"""
# pick a random starting point on the other side of the ring
quarter_turn = (self.parts // 4)
@@ -1162,10 +1183,10 @@ class RingBuilder(object):
'last_start': self._last_part_gather_start})
self._last_part_gather_start = start
self._gather_parts_for_balance_can_disperse(
assign_parts, start, replica_plan)
if not assign_parts:
self._gather_parts_for_balance_forced(assign_parts, start)
if disperse_first:
self._gather_parts_for_balance_can_disperse(
assign_parts, start, replica_plan)
self._gather_parts_for_balance_forced(assign_parts, start)
def _gather_parts_for_balance_forced(self, assign_parts, start, **kwargs):
"""

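The control flow above is the heart of the change: the balance gather loop now tries a dispersion-friendly gather (_gather_parts_for_balance_can_disperse) only on its first pass, and always follows up with a forced gather (_gather_parts_for_balance_forced). A minimal standalone sketch of that flow; the function names passed in and the gather-count constant are hypothetical stand-ins, not Swift's actual internals:

    MAX_BALANCE_GATHER_COUNT = 3   # assumed small constant, as in the loop above

    def gather_for_balance_loop(gather_dispersible, gather_forced, reassign):
        """Sketch of the rebalance gather loop with disperse-first behaviour."""
        assign_parts = {}
        for gather_count in range(MAX_BALANCE_GATHER_COUNT):
            disperse_first = (gather_count == 0)  # first attempt: go for dispersion
            if disperse_first:
                # only grab replicas whose removal will not hurt dispersion
                gather_dispersible(assign_parts)
            # always also grab replicas purely for weight/balance reasons
            gather_forced(assign_parts)
            if not assign_parts:
                # nothing movable, most likely blocked by min_part_hours
                return 'Unable to finish'
            reassign(assign_parts)
            assign_parts.clear()
        return 'Finished'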
View File

@@ -400,7 +400,7 @@ class TestRingBuilder(unittest.TestCase):
for dev in rb._iter_devs():
dev['tiers'] = utils.tiers_for_dev(dev)
assign_parts = defaultdict(list)
rb._gather_parts_for_balance(assign_parts, replica_plan)
rb._gather_parts_for_balance(assign_parts, replica_plan, False)
max_run = 0
run = 0
last_part = 0
@@ -1621,9 +1621,7 @@ class TestRingBuilder(unittest.TestCase):
rb.rebalance(seed=12345)
part_counts = self._partition_counts(rb, key='zone')
self.assertEqual(part_counts[0], 212)
self.assertEqual(part_counts[1], 211)
self.assertEqual(part_counts[2], 345)
self.assertEqual({0: 212, 1: 211, 2: 345}, part_counts)
# Now, devices 0 and 1 take 50% more than their fair shares by
# weight.
@@ -1633,9 +1631,7 @@ class TestRingBuilder(unittest.TestCase):
rb.rebalance(seed=12345)
part_counts = self._partition_counts(rb, key='zone')
self.assertEqual(part_counts[0], 256)
self.assertEqual(part_counts[1], 256)
self.assertEqual(part_counts[2], 256)
self.assertEqual({0: 256, 1: 256, 2: 256}, part_counts)
# Devices 0 and 1 may take up to 75% over their fair share, but the
# placement algorithm only wants to spread things out evenly between
@@ -1698,9 +1694,12 @@ class TestRingBuilder(unittest.TestCase):
rb.rebalance(seed=12345)
part_counts = self._partition_counts(rb, key='ip')
self.assertEqual(part_counts['127.0.0.1'], 238)
self.assertEqual(part_counts['127.0.0.2'], 237)
self.assertEqual(part_counts['127.0.0.3'], 293)
self.assertEqual({
'127.0.0.1': 237,
'127.0.0.2': 237,
'127.0.0.3': 294,
}, part_counts)
# Even out the weights: balance becomes perfect
for dev in rb.devs:
@@ -2451,6 +2450,105 @@ class TestRingBuilder(unittest.TestCase):
(0, 0, '127.0.0.1', 3): [0, 256, 0, 0],
})
def test_undispersable_zone_converge_on_balance(self):
rb = ring.RingBuilder(8, 6, 0)
dev_id = 0
# 3 regions, 2 zones in each region, 1 server with only *one* device in
# each zone (this is an absolutely pathological case)
for r in range(3):
for z in range(2):
ip = '127.%s.%s.1' % (r, z)
dev_id += 1
rb.add_dev({'id': dev_id, 'region': r, 'zone': z,
'weight': 1000, 'ip': ip, 'port': 10000,
'device': 'd%s' % dev_id})
rb.rebalance(seed=7)
# sanity, all balanced and 0 dispersion
self.assertEqual(rb.get_balance(), 0)
self.assertEqual(rb.dispersion, 0)
# add one device to the server in z1 for each region; N.B. when we
# *balance* this topology we will have very bad dispersion (too much
# weight in z1 compared to z2!)
for r in range(3):
z = 0
ip = '127.%s.%s.1' % (r, z)
dev_id += 1
rb.add_dev({'id': dev_id, 'region': r, 'zone': z,
'weight': 1000, 'ip': ip, 'port': 10000,
'device': 'd%s' % dev_id})
changed_part, _, _ = rb.rebalance(seed=7)
# sanity: every part moved exactly one replica to the new devices
self.assertEqual(changed_part, 2 ** 8)
# so the first time, rings are still unbalanced because we'll only move
# one replica of each part.
self.assertEqual(rb.get_balance(), 50.1953125)
self.assertEqual(rb.dispersion, 99.609375)
# N.B. since we mostly end up grabbing parts by "weight forced", some
# seeds, given some specific ring state, will randomly pick bad
# part-replicas that end up going back down onto the same devices
changed_part, _, _ = rb.rebalance(seed=7)
self.assertEqual(changed_part, 14)
# ... this isn't really "desirable" behavior, but even with bad luck,
# things do get better
self.assertEqual(rb.get_balance(), 47.265625)
self.assertEqual(rb.dispersion, 99.609375)
# but if you stick with it, eventually the next rebalance will get to
# move "the right" part-replicas, resulting in near-optimal balance
changed_part, _, _ = rb.rebalance(seed=7)
self.assertEqual(changed_part, 240)
self.assertEqual(rb.get_balance(), 0.390625)
self.assertEqual(rb.dispersion, 99.609375)
def test_undispersable_server_converge_on_balance(self):
rb = ring.RingBuilder(8, 6, 0)
dev_id = 0
# 3 zones, 2 servers in each zone, 2 devices on each server
for z in range(3):
for i in range(2):
ip = '127.0.%s.%s' % (z, i + 1)
for d in range(2):
dev_id += 1
rb.add_dev({'id': dev_id, 'region': 1, 'zone': z,
'weight': 1000, 'ip': ip, 'port': 10000,
'device': 'd%s' % dev_id})
rb.rebalance(seed=7)
# sanity, all balanced and 0 dispersion
self.assertEqual(rb.get_balance(), 0)
self.assertEqual(rb.dispersion, 0)
# add one device to the first server in each zone
for z in range(3):
ip = '127.0.%s.1' % z
dev_id += 1
rb.add_dev({'id': dev_id, 'region': 1, 'zone': z,
'weight': 1000, 'ip': ip, 'port': 10000,
'device': 'd%s' % dev_id})
changed_part, _, _ = rb.rebalance(seed=7)
# sanity: every part moved exactly one replica to the new devices
self.assertEqual(changed_part, 2 ** 8)
# but the first time, things are still unbalanced because the ring
# builder can move only one replica of each part
self.assertEqual(rb.get_balance(), 16.9921875)
self.assertEqual(rb.dispersion, 59.765625)
rb.rebalance(seed=7)
# balance converges into the 0-1 range
self.assertGreaterEqual(rb.get_balance(), 0)
self.assertLess(rb.get_balance(), 1)
# dispersion doesn't get any worse
self.assertEqual(rb.dispersion, 59.765625)
def test_effective_overload(self):
rb = ring.RingBuilder(8, 3, 1)
# z0
@@ -3595,12 +3693,12 @@ class TestGetRequiredOverload(unittest.TestCase):
rb.rebalance(seed=17)
self.assertEqual(rb.get_balance(), 1581.6406249999998)
# but despite the overall trend toward imbalance, in the tier
# with the huge device, the small device is trying to shed parts
# as effectively as it can (which would be useful if it was the
# only small device isolated in a tier with other huge devices
# trying to gobble up all the replicanths in the tier - see
# `test_one_small_guy_does_not_spoil_his_buddy`!)
# but despite the overall trend toward imbalance, in the tier with the
# huge device we want to see the small device (d4) shed parts as
# effectively as it can to the huge device in the same tier (d5).
# this is useful behavior any time a device within a tier wants, for
# whatever reason, parts from another device already in the same tier;
# another example is `test_one_small_guy_does_not_spoil_his_buddy`
expected = {
0: 123,
1: 123,
@@ -3691,6 +3789,45 @@ class TestGetRequiredOverload(unittest.TestCase):
self.assertEqual(rb.get_balance(), 30.46875)
# increasing overload moves towards one replica in each tier
rb.set_overload(0.3)
expected = {
0: 0.553443113772455,
1: 0.553443113772455,
2: 0.553443113772455,
3: 0.553443113772455,
4: 0.778443113772455,
5: 0.007784431137724551,
}
target_replicas = rb._build_target_replicas_by_tier()
self.assertEqual(expected, {t[-1]: r for (t, r) in
target_replicas.items()
if len(t) == 4})
# ... and as always increasing overload makes balance *worse*
rb.rebalance(seed=12)
self.assertEqual(rb.get_balance(), 30.46875)
# the little guy is really struggling to take his share, though
expected = {
0: 142,
1: 141,
2: 142,
3: 141,
4: 200,
5: 2,
}
self.assertEqual(expected, {
d['id']: d['parts'] for d in rb._iter_devs()})
# ... and you can see it in the balance!
expected = {
0: -7.367187499999986,
1: -8.019531249999986,
2: -7.367187499999986,
3: -8.019531249999986,
4: 30.46875,
5: 30.46875,
}
self.assertEqual(expected, rb._build_balance_per_dev())
rb.set_overload(0.5)
expected = {
0: 0.5232035928143712,
@@ -3705,7 +3842,7 @@ class TestGetRequiredOverload(unittest.TestCase):
target_replicas.items()
if len(t) == 4})
# ... and as always increasing overload makes balance *worse*
# because the device is so small, balance gets bad quickly
rb.rebalance(seed=17)
self.assertEqual(rb.get_balance(), 95.703125)

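The overload assertions above show the same trade-off from the operator-facing knob: raising overload pushes the per-device replicanth targets toward dispersion, and balance pays for it. A hedged inspection sketch, assuming a builder `rb` set up as in TestGetRequiredOverload with min_part_hours=0 so repeated rebalances are allowed, and using only calls the tests exercise (`_build_target_replicas_by_tier` is a private helper; leaf tiers are (region, zone, ip, dev_id) tuples):

    for overload in (0.0, 0.3, 0.5):
        rb.set_overload(overload)
        # per-device replicanth targets at this overload
        target_replicas = rb._build_target_replicas_by_tier()
        per_device = {t[-1]: r for t, r in target_replicas.items()
                      if len(t) == 4}
        rb.rebalance(seed=12)
        print('overload=%.1f balance=%.2f targets=%r'
              % (overload, rb.get_balance(), per_device))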
View File

@@ -1115,7 +1115,7 @@ class TestCooperativeRingBuilder(BaseTestCompositeBuilder):
rb1.rebalance()
self.assertEqual([rb1], update_calls)
self.assertEqual([rb1], can_part_move_calls.keys())
self.assertEqual(512, len(can_part_move_calls[rb1]))
self.assertEqual(768, len(can_part_move_calls[rb1]))
# two component builders with same parent builder
cb = CompositeRingBuilder()
@@ -1139,8 +1139,8 @@ class TestCooperativeRingBuilder(BaseTestCompositeBuilder):
# rb1 is being rebalanced so gets checked, and rb2 also gets checked
self.assertEqual(sorted([rb1, rb2]), sorted(can_part_move_calls))
self.assertEqual(512, len(can_part_move_calls[rb1]))
self.assertEqual(512, len(can_part_move_calls[rb2]))
self.assertEqual(768, len(can_part_move_calls[rb1]))
self.assertEqual(768, len(can_part_move_calls[rb2]))
def test_save_then_load(self):
cb = CompositeRingBuilder()

View File

@@ -619,10 +619,10 @@ class TestUtils(unittest.TestCase):
rb.rebalance(seed=100)
rb.validate()
self.assertEqual(rb.dispersion, 39.84375)
self.assertEqual(rb.dispersion, 55.46875)
report = dispersion_report(rb)
self.assertEqual(report['worst_tier'], 'r1z1')
self.assertEqual(report['max_dispersion'], 39.84375)
self.assertEqual(report['max_dispersion'], 44.921875)
def build_tier_report(max_replicas, placed_parts, dispersion,
replicas):
@@ -633,16 +633,17 @@ class TestUtils(unittest.TestCase):
'replicas': replicas,
}
# Each node should store 256 partitions to avoid multiple replicas
# Each node should store less than or equal to 256 partitions to
# avoid multiple replicas.
# 2/5 of total weight * 768 ~= 307 -> ~51 partitions end up with at
# least two replicas on the nodes in zone 1
expected = [
['r1z1', build_tier_report(
2, 256, 39.84375, [0, 0, 154, 102])],
2, 256, 44.921875, [0, 0, 141, 115])],
['r1z1-127.0.0.1', build_tier_report(
1, 256, 19.921875, [0, 205, 51, 0])],
1, 242, 29.33884297520661, [14, 171, 71, 0])],
['r1z1-127.0.0.2', build_tier_report(
1, 256, 19.921875, [0, 205, 51, 0])],
1, 243, 29.218106995884774, [13, 172, 71, 0])],
]
report = dispersion_report(rb, 'r1z1[^/]*$', verbose=True)
graph = report['graph']
@@ -667,9 +668,9 @@ class TestUtils(unittest.TestCase):
# can't move all the part-replicas in one rebalance
rb.rebalance(seed=100)
report = dispersion_report(rb, verbose=True)
self.assertEqual(rb.dispersion, 9.375)
self.assertEqual(report['worst_tier'], 'r1z1-127.0.0.1')
self.assertEqual(report['max_dispersion'], 7.18562874251497)
self.assertEqual(rb.dispersion, 11.71875)
self.assertEqual(report['worst_tier'], 'r1z1-127.0.0.2')
self.assertEqual(report['max_dispersion'], 8.875739644970414)
# do a second rebalance
rb.rebalance(seed=100)
report = dispersion_report(rb, verbose=True)
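The dispersion_report expectations change here because the builder now accepts somewhat worse dispersion in exchange for better balance, as the commit title says. A small usage sketch for reading such a report, assuming swift.common.ring.utils.dispersion_report and the report shape this test asserts against (a 'graph' list of [tier_name, tier_report] pairs whose dicts carry 'placed_parts', 'dispersion' and 'replicas'), with `rb` being an already rebalanced builder:

    from swift.common.ring.utils import dispersion_report

    # summarize how badly replicas are concentrated after a rebalance;
    # the tier filter is the same one the test above uses
    report = dispersion_report(rb, r'r1z1[^/]*$', verbose=True)
    print('worst tier: %s  max dispersion: %.4f'
          % (report['worst_tier'], report['max_dispersion']))
    for tier_name, tier_report in report['graph']:
        print('%-20s placed=%s dispersion=%.4f replica histogram=%r'
              % (tier_name, tier_report['placed_parts'],
                 tier_report['dispersion'], tier_report['replicas']))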