diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py
index 43b9288a48..325f6f723c 100644
--- a/swift/common/ring/builder.py
+++ b/swift/common/ring/builder.py
@@ -646,23 +646,21 @@ class RingBuilder(object):
 
     def _get_available_parts(self):
         """
-        Returns a tuple (wanted_parts_total, dict of (tier: available parts in
-        other tiers) for all tiers in the ring.
+        Returns a dict of (tier: available parts in other tiers) for all tiers
+        in the ring.
 
         Devices that have too much partitions (negative parts_wanted) are
         ignored, otherwise the sum of all parts_wanted is 0 +/- rounding
         errors.
-
        """
-        wanted_parts_total = 0
         wanted_parts_for_tier = {}
         for dev in self._iter_devs():
-            wanted_parts_total += max(0, dev['parts_wanted'])
+            pw = max(0, dev['parts_wanted'])
             for tier in tiers_for_dev(dev):
                 if tier not in wanted_parts_for_tier:
                     wanted_parts_for_tier[tier] = 0
-                wanted_parts_for_tier[tier] += max(0, dev['parts_wanted'])
-        return (wanted_parts_total, wanted_parts_for_tier)
+                wanted_parts_for_tier[tier] += pw
+        return wanted_parts_for_tier
 
     def _gather_reassign_parts(self):
         """
@@ -674,6 +672,22 @@ class RingBuilder(object):
         # as a hot-spot).
         tfd = {}
 
+        tiers_by_len = defaultdict(set)
+        for dev in self._iter_devs():
+            tiers = tiers_for_dev(dev)
+            tfd[dev['id']] = tiers
+            for tier in tiers:
+                tiers_by_len[len(tier)].add(tier)
+
+        tiers_by_len = dict((length, list(tiers))
+                            for length, tiers in tiers_by_len.items())
+
+        sibling_tiers = {}
+        for length, tiers in tiers_by_len.items():
+            for i, tier in enumerate(tiers):
+                sibling_tiers[tier] = [t for t in (tiers[:i] + tiers[(i + 1):])
+                                       if t[:-1] == tier[:-1]]
+
         # First we gather partitions from removed devices. Since removed
         # devices usually indicate device failures, we have no choice but to
         # reassign these partitions. However, we mark them as moved so later
@@ -692,8 +706,7 @@ class RingBuilder(object):
         # currently sufficient spread out across the cluster.
         spread_out_parts = defaultdict(list)
         max_allowed_replicas = self._build_max_replicas_by_tier()
-        wanted_parts_total, wanted_parts_for_tier = \
-            self._get_available_parts()
+        wanted_parts_for_tier = self._get_available_parts()
         moved_parts = 0
         for part in xrange(self.parts):
             # Only move one replica at a time if possible.
@@ -706,8 +719,6 @@
             # revealed the lambda invocation as a significant cost.
             replicas_at_tier = {}
             for dev in self._devs_for_part(part):
-                if dev['id'] not in tfd:
-                    tfd[dev['id']] = tiers_for_dev(dev)
                 for tier in tfd[dev['id']]:
                     if tier not in replicas_at_tier:
                         replicas_at_tier[tier] = 1
@@ -719,16 +730,42 @@
             for replica in self._replicas_for_part(part):
                 dev = self.devs[self._replica2part2dev[replica][part]]
                 removed_replica = False
-                if dev['id'] not in tfd:
-                    tfd[dev['id']] = tiers_for_dev(dev)
                 for tier in tfd[dev['id']]:
-                    rep_at_tier = 0
-                    if tier in replicas_at_tier:
-                        rep_at_tier = replicas_at_tier[tier]
-                    # Only allowing parts to be gathered if
-                    # there are wanted parts on other tiers
-                    available_parts_for_tier = wanted_parts_total - \
-                        wanted_parts_for_tier[tier] - moved_parts
+                    rep_at_tier = replicas_at_tier.get(tier, 0)
+
+                    # If this tier's not overcrowded, there's nothing to
+                    # gather, so we can avoid some calculation here as an
+                    # optimization.
+                    if rep_at_tier <= max_allowed_replicas[tier]:
+                        continue
+
+                    available_parts_for_tier = sum(
+                        wanted_parts_for_tier[t]
+                        for t in sibling_tiers[tier]
+                        # If a sibling tier is "full" with respect to
+                        # partition dispersion, but not "full" with respect
+                        # to parts_wanted, we don't count it as a possible
+                        # destination.
+                        #
+                        # Otherwise, we gather a partition from tier X
+                        # (because its replicas are not spread out), and
+                        # then we may place it right back in tier X or in
+                        # another tier that already has replicas (because
+                        # that tier has parts_wanted). Then, on the next
+                        # rebalance, it'll happen again, and then again...
+                        #
+                        # Worse yet, this "dancing replica" immobilizes
+                        # other replicas of the partition that want to move
+                        # because they're on devices with negative
+                        # parts_wanted. This can lead to a replica that
+                        # sticks to a zero-weight device no matter how often
+                        # the ring is rebalanced.
+                        if (max_allowed_replicas[t] >
+                            replicas_at_tier.get(t, 0))
+                    ) - moved_parts
+
+                    # Only allow a part to be gathered if there are wanted
+                    # parts on other tiers.
                     if (rep_at_tier > max_allowed_replicas[tier] and
                             self._last_part_moves[part] >=
                             self.min_part_hours and
@@ -741,8 +778,6 @@
                             moved_parts += 1
                             break
                 if removed_replica:
-                    if dev['id'] not in tfd:
-                        tfd[dev['id']] = tiers_for_dev(dev)
                     for tier in tfd[dev['id']]:
                         replicas_at_tier[tier] -= 1
 
diff --git a/test/unit/common/ring/test_builder.py b/test/unit/common/ring/test_builder.py
index ef524f0a93..309abf032c 100644
--- a/test/unit/common/ring/test_builder.py
+++ b/test/unit/common/ring/test_builder.py
@@ -19,6 +19,7 @@ import operator
 import os
 import unittest
 import cPickle as pickle
+from array import array
 from collections import defaultdict
 from math import ceil
 from tempfile import mkdtemp
@@ -656,6 +657,58 @@ class TestRingBuilder(unittest.TestCase):
 
         rb.rebalance()
 
+    def test_remove_last_partition_from_zero_weight(self):
+        rb = ring.RingBuilder(4, 3, 1)
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 1, 'weight': 1.0,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 2, 'weight': 2.0,
+                    'ip': '127.0.0.2', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 2, 'region': 0, 'zone': 3, 'weight': 3.0,
+                    'ip': '127.0.0.3', 'port': 10000, 'device': 'sda'})
+
+        rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 0.5,
+                    'ip': '127.0.0.3', 'port': 10001, 'device': 'zero'})
+
+        zero_weight_dev = 3
+
+        rb.rebalance()
+
+        # We want at least one partition with replicas only in zone 2 and 3
+        # due to device weights. It would *like* to spread out into zone 1,
+        # but can't, due to device weight.
+        #
+        # Also, we want such a partition to have a replica on device 3,
+        # which we will then reduce to zero weight. This should cause the
+        # removal of the replica from device 3.
+        #
+        # Getting this to happen by chance is hard, so let's just set up a
+        # builder so that it's in the state we want. This is a synthetic
+        # example; while the bug has happened on a real cluster, that
+        # builder file had a part_power of 16, so its contents are much too
+        # big to include here.
+        rb._replica2part2dev = [
+            #                               these are the relevant ones
+            #                                   |  |  |  |
+            #                                   v  v  v  v
+            array('H', [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
+            array('H', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]),
+            array('H', [0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 2, 2, 2, 2])]
+
+        rb.set_dev_weight(zero_weight_dev, 0.0)
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=1)
+
+        node_counts = defaultdict(int)
+        for part2dev_id in rb._replica2part2dev:
+            for dev_id in part2dev_id:
+                node_counts[dev_id] += 1
+        self.assertEqual(node_counts[zero_weight_dev], 0)
+
+        # it's as balanced as it gets, so nothing moves anymore
+        rb.pretend_min_part_hours_passed()
+        parts_moved, _balance = rb.rebalance(seed=1)
+        self.assertEqual(parts_moved, 0)
+
     def test_region_fullness_with_balanceable_ring(self):
         rb = ring.RingBuilder(8, 3, 1)
         rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1,
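
A note for reviewers: the sibling-tier bookkeeping and the new availability check are easier to follow in isolation than inside _gather_reassign_parts(). The following is a minimal standalone sketch, not Swift's real API: the simplified tiers_for_dev(), the four-device toy ring, and all per-tier numbers are invented for illustration; only the grouping and filtering logic mirrors the patch.

    from collections import defaultdict


    def tiers_for_dev(dev):
        # Simplified stand-in for swift.common.ring.utils.tiers_for_dev():
        # each device sits at the bottom of a chain of nested tiers.
        return ((dev['region'],),
                (dev['region'], dev['zone']),
                (dev['region'], dev['zone'], dev['id']))


    # Toy ring: one region, three zones, two devices in zone 3,
    # mirroring the test above.
    devs = [{'id': 0, 'region': 0, 'zone': 1},
            {'id': 1, 'region': 0, 'zone': 2},
            {'id': 2, 'region': 0, 'zone': 3},
            {'id': 3, 'region': 0, 'zone': 3}]

    # Group tiers by depth, then record each tier's siblings: the tiers
    # at the same depth that share the same parent tier.
    tiers_by_len = defaultdict(set)
    for dev in devs:
        for tier in tiers_for_dev(dev):
            tiers_by_len[len(tier)].add(tier)

    sibling_tiers = {}
    for length, tiers in ((l, sorted(ts)) for l, ts in tiers_by_len.items()):
        for i, tier in enumerate(tiers):
            sibling_tiers[tier] = [t for t in (tiers[:i] + tiers[(i + 1):])
                                   if t[:-1] == tier[:-1]]

    print(sibling_tiers[(0, 3)])    # -> [(0, 1), (0, 2)]

    # Hypothetical state for one partition whose replicas sit on devices
    # 1, 2 and 3 (zones 2, 3, 3), so the zone tier (0, 3) is overcrowded.
    max_allowed_replicas = {(0, 1): 1, (0, 2): 1, (0, 3): 1}
    replicas_at_tier = {(0, 2): 1, (0, 3): 2}
    wanted_parts_for_tier = {(0, 1): 0, (0, 2): 5, (0, 3): 4}
    moved_parts = 0

    tier = (0, 3)
    available_parts_for_tier = sum(
        wanted_parts_for_tier[t]
        for t in sibling_tiers[tier]
        # count a sibling only if it can still take another replica of
        # this partition without itself becoming overcrowded
        if max_allowed_replicas[t] > replicas_at_tier.get(t, 0)
    ) - moved_parts

    print(available_parts_for_tier)  # -> 0, so the replica stays put

Zone 2 has parts_wanted but already holds a replica, and zone 1 may take a replica but wants no partitions, so nothing counts as a destination. The pre-patch formula, wanted_parts_total - wanted_parts_for_tier[tier] - moved_parts, would have yielded 9 - 4 - 0 = 5 here and gathered the replica anyway, only for placement to drop it back into zone 2 or 3: exactly the "dancing replica" loop the inline comment describes.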
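The arrow annotation in the synthetic _replica2part2dev table is also worth spelling out. A throwaway check (the zone map below is copied by hand from the devices the test adds; it is not part of the test) confirms that the marked columns, partitions 8 through 11, are exactly the ones with a replica on device 3 and no replica in zone 1:

    from array import array

    # device id -> zone, per the rb.add_dev() calls in the test
    zone_of_dev = {0: 1, 1: 2, 2: 3, 3: 3}

    replica2part2dev = [
        array('H', [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
        array('H', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]),
        array('H', [0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 2, 2, 2, 2])]

    for part in range(16):
        devs_for_part = [row[part] for row in replica2part2dev]
        zones = set(zone_of_dev[d] for d in devs_for_part)
        if 3 in devs_for_part and 1 not in zones:
            print("part %d: devices %r, zones %r"
                  % (part, devs_for_part, sorted(zones)))

This prints partitions 8, 9, 10 and 11, each with replicas on devices [2, 1, 3], i.e. in zones 2 and 3 only. Those are the partitions that must shed their device-3 replica once its weight drops to zero, which is what the assertion on node_counts[zero_weight_dev] verifies.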