diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py
index 141ae7440b..ee10a934c2 100644
--- a/swift/common/ring/builder.py
+++ b/swift/common/ring/builder.py
@@ -326,8 +326,7 @@ class RingBuilder(object):
         that before). Because of this, it keeps rebalancing until the device
         skew (number of partitions a device wants compared to what it has) gets
         below 1% or doesn't change by more than 1% (only happens with ring that
-        can't be balanced no matter what -- like with 3 zones of differing
-        weights with replicas set to 3).
+        can't be balanced no matter what).
 
         :returns: (number_of_partitions_altered, resulting_balance)
         """
@@ -517,8 +516,18 @@ class RingBuilder(object):
                 # indicate its strong desire to give up everything it has.
                 dev['parts_wanted'] = -self.parts * self.replicas
             else:
-                dev['parts_wanted'] = \
-                    int(weight_of_one_part * dev['weight']) - dev['parts']
+                dev['parts_wanted'] = (
+                    # Round up here so that every partition ultimately ends up
+                    # with a placement.
+                    #
+                    # Imagine 5 partitions to be placed on 4 devices. If we
+                    # didn't use math.ceil() here, each device would have a
+                    # parts_wanted of 1, so 4 partitions would be placed but
+                    # the last would not, probably resulting in a crash. This
+                    # way, some devices end up with leftover parts_wanted, but
+                    # at least every partition ends up somewhere.
+                    int(math.ceil(weight_of_one_part * dev['weight'])) -
+                    dev['parts'])
 
     def _adjust_replica2part2dev_size(self):
         """
@@ -754,9 +763,24 @@ class RingBuilder(object):
                                replicas_to_replace may be shared for multiple
                                partitions, so be sure you do not modify it.
         """
+        parts_available_in_tier = defaultdict(int)
         for dev in self._iter_devs():
             dev['sort_key'] = self._sort_key_for(dev)
-            dev['tiers'] = tiers_for_dev(dev)
+            tiers = tiers_for_dev(dev)
+            dev['tiers'] = tiers
+            for tier in tiers:
+                # Note: this represents how many partitions may be assigned to
+                # a given tier (region/zone/server/disk). It does not take
+                # into account how many partitions a given tier wants to shed.
+                #
+                # If we did not do this, we could have a zone where, at some
+                # point during assignment, number-of-parts-to-gain equals
+                # number-of-parts-to-shed. At that point, no further placement
+                # into that zone would occur since its parts_available_in_tier
+                # would be 0. This would happen any time a zone had any device
+                # with partitions to shed, which is any time a device is being
+                # removed, which is a pretty frequent operation.
+                parts_available_in_tier[tier] += max(dev['parts_wanted'], 0)
 
         available_devs = \
             sorted((d for d in self._iter_devs() if d['weight']),
@@ -795,23 +819,25 @@ class RingBuilder(object):
             # Gather up what other tiers (regions, zones, ip/ports, and
             # devices) the replicas not-to-be-moved are in for this part.
             other_replicas = defaultdict(int)
-            unique_tiers_by_tier_len = defaultdict(set)
+            occupied_tiers_by_tier_len = defaultdict(set)
             for replica in self._replicas_for_part(part):
                 if replica not in replace_replicas:
                     dev = self.devs[self._replica2part2dev[replica][part]]
                     for tier in dev['tiers']:
                         other_replicas[tier] += 1
-                        unique_tiers_by_tier_len[len(tier)].add(tier)
+                        occupied_tiers_by_tier_len[len(tier)].add(tier)
 
             for replica in replace_replicas:
+                # Find a new home for this replica
                 tier = ()
                 depth = 1
                 while depth <= max_tier_depth:
                     # Order the tiers by how many replicas of this
                     # partition they already have. Then, of the ones
-                    # with the smallest number of replicas, pick the
-                    # tier with the hungriest drive and then continue
-                    # searching in that subtree.
+                    # with the smallest number of replicas and that have
+                    # room to accept more partitions, pick the tier with
+                    # the hungriest drive and then continue searching in
+                    # that subtree.
                     #
                     # There are other strategies we could use here,
                     # such as hungriest-tier (i.e. biggest
@@ -819,10 +845,11 @@ class RingBuilder(object):
                     # However, hungriest-drive is what was used here
                     # before, and it worked pretty well in practice.
                     #
-                    # Note that this allocator will balance things as
-                    # evenly as possible at each level of the device
-                    # layout. If your layout is extremely unbalanced,
-                    # this may produce poor results.
+                    # Note that this allocator prioritizes even device
+                    # filling over dispersion, so if your layout is
+                    # extremely unbalanced, you may not get the replica
+                    # dispersion that you expect, and your durability
+                    # may be lessened.
                     #
                     # This used to be a cute, recursive function, but it's been
                     # unrolled for performance.
@@ -834,18 +861,28 @@ class RingBuilder(object):
                     # short-circuit the search while still ensuring we get the
                     # right tier.
                     candidates_with_replicas = \
-                        unique_tiers_by_tier_len[len(tier) + 1]
-                    # Find a tier with the minimal replica count and the
-                    # hungriest drive among all the tiers with the minimal
-                    # replica count.
-                    if len(tier2children[tier]) > \
+                        occupied_tiers_by_tier_len[len(tier) + 1]
+
+                    # Among the tiers with room for more partitions,
+                    # find one with the smallest possible number of
+                    # replicas already in it, breaking ties by which one
+                    # has the hungriest drive.
+                    candidates_with_room = [
+                        t for t in tier2children[tier]
+                        if parts_available_in_tier[t] > 0]
+
+                    if len(candidates_with_room) > \
                             len(candidates_with_replicas):
-                        # There exists at least one tier with 0 other replicas
-                        tier = max((t for t in tier2children[tier]
+                        # There exists at least one tier with room for
+                        # another partition and 0 other replicas already
+                        # in it, so we can use a faster search. The else
+                        # branch's search would work here, but it's
+                        # significantly slower.
+                        tier = max((t for t in candidates_with_room
                                     if other_replicas[t] == 0),
                                    key=tier2sort_key.__getitem__)
                     else:
-                        tier = max(tier2children[tier],
+                        tier = max(candidates_with_room,
                                    key=lambda t: (-other_replicas[t],
                                                   tier2sort_key[t]))
                     depth += 1
@@ -855,8 +892,9 @@ class RingBuilder(object):
                 old_sort_key = dev['sort_key']
                 new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                 for tier in dev['tiers']:
+                    parts_available_in_tier[tier] -= 1
                     other_replicas[tier] += 1
-                    unique_tiers_by_tier_len[len(tier)].add(tier)
+                    occupied_tiers_by_tier_len[len(tier)].add(tier)
 
                     index = bisect.bisect_left(tier2dev_sort_key[tier],
                                                old_sort_key)
diff --git a/test/unit/common/ring/test_builder.py b/test/unit/common/ring/test_builder.py
index c42e338910..de05480a55 100644
--- a/test/unit/common/ring/test_builder.py
+++ b/test/unit/common/ring/test_builder.py
@@ -35,6 +35,19 @@ class TestRingBuilder(unittest.TestCase):
     def tearDown(self):
         rmtree(self.testdir, ignore_errors=1)
 
+    def _get_population_by_region(self, builder):
+        """
+        Returns a dictionary mapping region to number of partitions in that
+        region.
+ """ + population_by_region = defaultdict(int) + r = builder.get_ring() + for part2dev_id in r._replica2part2dev_id: + for dev_id in part2dev_id: + dev = r.devs[dev_id] + population_by_region[dev['region']] += 1 + return dict(population_by_region.items()) + def test_init(self): rb = ring.RingBuilder(8, 3, 1) self.assertEquals(rb.part_power, 8) @@ -641,6 +654,92 @@ class TestRingBuilder(unittest.TestCase): rb.rebalance() + def test_region_fullness_with_balanceable_ring(self): + rb = ring.RingBuilder(8, 3, 1) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'}) + + rb.add_dev({'id': 2, 'region': 1, 'zone': 0, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'}) + rb.add_dev({'id': 3, 'region': 1, 'zone': 1, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10004, 'device': 'sda1'}) + + rb.add_dev({'id': 4, 'region': 2, 'zone': 0, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10005, 'device': 'sda1'}) + rb.add_dev({'id': 5, 'region': 2, 'zone': 1, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10006, 'device': 'sda1'}) + + rb.add_dev({'id': 6, 'region': 3, 'zone': 0, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10007, 'device': 'sda1'}) + rb.add_dev({'id': 7, 'region': 3, 'zone': 1, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10008, 'device': 'sda1'}) + rb.rebalance(seed=2) + + population_by_region = self._get_population_by_region(rb) + self.assertEquals(population_by_region, + {0: 192, 1: 192, 2: 192, 3: 192}) + + def test_region_fullness_with_unbalanceable_ring(self): + rb = ring.RingBuilder(8, 3, 1) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 2, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 2, + 'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'}) + + rb.add_dev({'id': 2, 'region': 1, 'zone': 0, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'}) + rb.add_dev({'id': 3, 'region': 1, 'zone': 1, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10004, 'device': 'sda1'}) + rb.rebalance(seed=2) + + population_by_region = self._get_population_by_region(rb) + self.assertEquals(population_by_region, {0: 512, 1: 256}) + + def test_adding_region_slowly_with_unbalanceable_ring(self): + rb = ring.RingBuilder(8, 3, 1) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 2, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 2, + 'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'}) + rb.rebalance() + + rb.add_dev({'id': 2, 'region': 1, 'zone': 0, 'weight': 0.25, + 'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'}) + rb.add_dev({'id': 3, 'region': 1, 'zone': 1, 'weight': 0.25, + 'ip': '127.0.0.1', 'port': 10004, 'device': 'sda1'}) + rb.pretend_min_part_hours_passed() + rb.rebalance(seed=2) + + # there's not enough room in r1 for every partition to have a replica + # in it, so only 86 assignments occur in r1 (that's ~1/5 of the total, + # since r1 has 1/5 of the weight). 
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 682, 1: 86})
+
+        # and since there's not enough room, subsequent rebalances will not
+        # cause additional assignments to r1
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 682, 1: 86})
+
+        # after you add more weight, more partition assignments move
+        rb.set_dev_weight(2, 0.5)
+        rb.set_dev_weight(3, 0.5)
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 614, 1: 154})
+
+        rb.set_dev_weight(2, 1.0)
+        rb.set_dev_weight(3, 1.0)
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 512, 1: 256})
+
     def test_set_replicas_increase(self):
         rb = ring.RingBuilder(8, 2, 0)
         rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1,
diff --git a/test/unit/common/ring/test_ring.py b/test/unit/common/ring/test_ring.py
index 7f938f0cfe..fff715785b 100644
--- a/test/unit/common/ring/test_ring.py
+++ b/test/unit/common/ring/test_ring.py
@@ -703,7 +703,10 @@ class TestRing(TestRingBase):
             for region in xrange(1, 5):
                 rb.add_dev({'id': next_dev_id,
                             'ip': '1.%d.1.%d' % (region, server), 'port': 1234,
-                            'zone': 1, 'region': region, 'weight': 1.0})
+                            # 108.0 is the weight of all devices created prior to
+                            # this test in region 0; this way all regions have
+                            # equal combined weight
+                            'zone': 1, 'region': region, 'weight': 108.0})
                 next_dev_id += 1
         rb.pretend_min_part_hours_passed()
         rb.rebalance(seed=1)
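
The rounding argument in the _set_parts_wanted hunk above can be checked with a few lines of arithmetic. The following standalone Python sketch uses hypothetical numbers (5 partitions, 4 equal-weight devices, 1 replica, mirroring the comment's example) and is not part of the change itself:

import math

parts, replicas = 5, 1
weights = [1, 1, 1, 1]  # four equal-weight devices (hypothetical)
weight_of_one_part = parts * replicas / float(sum(weights))  # 1.25

# Truncating gives each device int(1.25) == 1 slot: 4 slots for 5 partitions,
# so one partition has nowhere to go.
truncated_slots = sum(int(weight_of_one_part * w) for w in weights)

# Rounding up gives each device int(math.ceil(1.25)) == 2 slots: 8 slots,
# so every partition can be placed, at the cost of leftover parts_wanted.
rounded_slots = sum(int(math.ceil(weight_of_one_part * w)) for w in weights)

assert truncated_slots < parts <= rounded_slots  # 4 < 5 <= 8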