Only move too-close-together replicas when they can spread out.

Imagine a 3-zone ring, and consider a partition in that ring with
replicas placed as follows:

* replica 0 is on device A (zone 2)
* replica 1 is on device B (zone 1)
* replica 2 is on device C (zone 2)

Further, imagine that there are zero parts_wanted in all of zone 3;
that is, zone 3 is completely full. However, zones 1 and 2 each have
at least one parts_wanted on at least one device.
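To put numbers on this: with 3 replicas and 3 zones, each zone may hold
at most ceil(3/3) = 1 replica of a given partition, which is roughly
what _build_max_replicas_by_tier() computes at the zone tier. A minimal
sketch (the occupancy dict is illustrative, not from a real builder):

    from math import ceil

    replica_count, zone_count = 3, 3
    max_allowed_per_zone = int(ceil(replica_count / float(zone_count)))  # 1

    # Zone occupancy of the example partition: replicas 0 and 2 in
    # zone 2, replica 1 in zone 1, nothing in zone 3.
    replicas_at_zone = {1: 1, 2: 2, 3: 0}

    overcrowded = [z for z, n in replicas_at_zone.items()
                   if n > max_allowed_per_zone]
    print(overcrowded)  # [2] -- zone 2 is what the builder gathers from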

When the ring builder goes to gather replicas to move, it gathers
replica 0 because there are three zones available, but the replicas
are only in two of them. Then it places replica 0 somewhere in zone 1
or 2, because those are the only zones with parts_wanted. Notice
that this does *not* do anything to spread the partition out better.

Then, on the next rebalance, replica 0 gets picked up and moved
(again) but doesn't improve its placement (again).

If your builder has min_part_hours > 0 (and it should), then replicas
1 and 2 cannot move at all. A coworker observed the bug because a
customer had such a partition, and its replica 2 was on a zero-weight
device. He thought it odd that a zero-weight device should still have
one partition on it despite the ring having been rebalanced dozens of
times.

Even if you don't have zero-weight devices, having a bunch of
partitions trade places on each rebalance isn't particularly good.

Note that this only happens with an unbalanceable ring; if the ring
*can* balance, the gathered partitions will swap places, but they will
get spread across more zones, so they won't get gathered up again on
the next rebalance.
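
The fix below makes the gathering step first check whether any sibling
tier could actually accept the gathered replica: a sibling counts only
if it has parts_wanted *and* is still under its max-replica allowance.
A hedged distillation of that rule, continuing the example above (all
per-zone numbers are assumed):

    # parts_wanted per zone: zones 1 and 2 have room, zone 3 is full.
    wanted_parts_for_zone = {1: 5, 2: 3, 3: 0}
    max_allowed_replicas = {1: 1, 2: 1, 3: 1}
    replicas_at_zone = {1: 1, 2: 2, 3: 0}

    def gatherable(zone, siblings):
        # Count wanted parts only in sibling zones that can still take
        # another replica of this partition without overcrowding.
        available = sum(wanted_parts_for_zone[z] for z in siblings
                        if max_allowed_replicas[z] > replicas_at_zone[z])
        return available > 0

    # Zone 2 is overcrowded, but zone 1 is already at its one-replica
    # cap and zone 3 has no parts_wanted, so the replica stays put:
    print(gatherable(2, [1, 3]))  # False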

Change-Id: I8f44f032caac25c44778a497dedf23f5cb61b6bb
Closes-Bug: 1400083
Author: Samuel Merritt
Date:   2014-12-10 15:59:21 -08:00
Commit: 1880351f1a (parent: cc2f0f4ed6)

2 files changed, 110 insertions(+), 22 deletions(-)

swift/common/ring/builder.py

@@ -646,23 +646,21 @@ class RingBuilder(object):
 
     def _get_available_parts(self):
         """
-        Returns a tuple (wanted_parts_total, dict of (tier: available parts in
-        other tiers) for all tiers in the ring.
+        Returns a dict of (tier: available parts in other tiers) for all tiers
+        in the ring.
 
         Devices that have too much partitions (negative parts_wanted) are
         ignored, otherwise the sum of all parts_wanted is 0 +/- rounding
         errors.
         """
-        wanted_parts_total = 0
         wanted_parts_for_tier = {}
         for dev in self._iter_devs():
-            wanted_parts_total += max(0, dev['parts_wanted'])
+            pw = max(0, dev['parts_wanted'])
             for tier in tiers_for_dev(dev):
                 if tier not in wanted_parts_for_tier:
                     wanted_parts_for_tier[tier] = 0
-                wanted_parts_for_tier[tier] += max(0, dev['parts_wanted'])
-        return (wanted_parts_total, wanted_parts_for_tier)
+                wanted_parts_for_tier[tier] += pw
+        return wanted_parts_for_tier
 
     def _gather_reassign_parts(self):
         """
@@ -674,6 +672,22 @@ class RingBuilder(object):
         # as a hot-spot).
         tfd = {}
 
+        tiers_by_len = defaultdict(set)
+        for dev in self._iter_devs():
+            tiers = tiers_for_dev(dev)
+            tfd[dev['id']] = tiers
+            for tier in tiers:
+                tiers_by_len[len(tier)].add(tier)
+
+        tiers_by_len = dict((length, list(tiers))
+                            for length, tiers in tiers_by_len.items())
+
+        sibling_tiers = {}
+        for length, tiers in tiers_by_len.items():
+            for i, tier in enumerate(tiers):
+                sibling_tiers[tier] = [t for t in (tiers[:i] + tiers[(i + 1):])
+                                       if t[:-1] == tier[:-1]]
+
         # First we gather partitions from removed devices. Since removed
         # devices usually indicate device failures, we have no choice but to
         # reassign these partitions. However, we mark them as moved so later
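
For intuition about what sibling_tiers ends up holding: tiers are
tuples like (region,), (region, zone), (region, zone, ip), and two
tiers are siblings when they have the same length and the same parent
(equal t[:-1]). A toy reconstruction with two hypothetical devices in
one region and two zones:

    from collections import defaultdict

    devs_tiers = [
        ((0,), (0, 1), (0, 1, '127.0.0.1:10000')),
        ((0,), (0, 2), (0, 2, '127.0.0.2:10000')),
    ]

    tiers_by_len = defaultdict(set)
    for tiers in devs_tiers:
        for tier in tiers:
            tiers_by_len[len(tier)].add(tier)

    sibling_tiers = {}
    for tiers in (list(ts) for ts in tiers_by_len.values()):
        for i, tier in enumerate(tiers):
            sibling_tiers[tier] = [t for t in tiers[:i] + tiers[i + 1:]
                                   if t[:-1] == tier[:-1]]

    print(sibling_tiers[(0, 1)])                     # [(0, 2)]
    print(sibling_tiers[(0, 1, '127.0.0.1:10000')])  # [] (different zones)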
@@ -692,8 +706,7 @@ class RingBuilder(object):
         # currently sufficient spread out across the cluster.
         spread_out_parts = defaultdict(list)
         max_allowed_replicas = self._build_max_replicas_by_tier()
-        wanted_parts_total, wanted_parts_for_tier = \
-            self._get_available_parts()
+        wanted_parts_for_tier = self._get_available_parts()
         moved_parts = 0
         for part in xrange(self.parts):
             # Only move one replica at a time if possible.
@@ -706,8 +719,6 @@ class RingBuilder(object):
             # revealed the lambda invocation as a significant cost.
             replicas_at_tier = {}
             for dev in self._devs_for_part(part):
-                if dev['id'] not in tfd:
-                    tfd[dev['id']] = tiers_for_dev(dev)
                 for tier in tfd[dev['id']]:
                     if tier not in replicas_at_tier:
                         replicas_at_tier[tier] = 1
@@ -719,16 +730,42 @@ class RingBuilder(object):
             for replica in self._replicas_for_part(part):
                 dev = self.devs[self._replica2part2dev[replica][part]]
                 removed_replica = False
-                if dev['id'] not in tfd:
-                    tfd[dev['id']] = tiers_for_dev(dev)
                 for tier in tfd[dev['id']]:
-                    rep_at_tier = 0
-                    if tier in replicas_at_tier:
-                        rep_at_tier = replicas_at_tier[tier]
-                    # Only allowing parts to be gathered if
-                    # there are wanted parts on other tiers
-                    available_parts_for_tier = wanted_parts_total - \
-                        wanted_parts_for_tier[tier] - moved_parts
+                    rep_at_tier = replicas_at_tier.get(tier, 0)
+
+                    # If this tier's not overcrowded, there's nothing to
+                    # gather, so we can avoid some calculation here as an
+                    # optimization.
+                    if rep_at_tier <= max_allowed_replicas[tier]:
+                        continue
+
+                    available_parts_for_tier = sum(
+                        wanted_parts_for_tier[t]
+                        for t in sibling_tiers[tier]
+                        # If a sibling tier is "full" with respect to
+                        # partition dispersion, but not "full" with respect
+                        # to parts_wanted, we don't count it as a possible
+                        # destination.
+                        #
+                        # Otherwise, we gather a partition from tier X
+                        # (because its replicas are not spread out), and
+                        # then we may place it right back in tier X or in
+                        # another tier that already has replicas (because
+                        # that tier has parts_wanted). Then, on the next
+                        # rebalance, it'll happen again, and then again...
+                        #
+                        # Worse yet, this "dancing replica" immobilizes
+                        # other replicas of the partition that want to move
+                        # because they're on devices with negative
+                        # parts_wanted. This can lead to a replica that
+                        # sticks to a zero-weight device no matter how often
+                        # the ring is rebalanced.
+                        if (max_allowed_replicas[t] >
+                            replicas_at_tier.get(t, 0))
+                    ) - moved_parts
+
+                    # Only allow a part to be gathered if there are wanted
+                    # parts on other tiers.
                     if (rep_at_tier > max_allowed_replicas[tier] and
                             self._last_part_moves[part] >=
                             self.min_part_hours and
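
That sum is the heart of the fix. With the commit-message example
(zone 2 overcrowded, zone 1 at its replica cap, zone 3 with zero
parts_wanted, and illustrative per-zone totals), the old and new
formulas disagree exactly where it matters:

    wanted_parts_for_tier = {1: 5, 2: 3, 3: 0}
    max_allowed_replicas = {1: 1, 2: 1, 3: 1}
    replicas_at_tier = {1: 1, 2: 2, 3: 0}
    wanted_parts_total = sum(wanted_parts_for_tier.values())  # 8
    moved_parts = 0
    tier, siblings = 2, [1, 3]

    # Old formula: wanted parts *anywhere else* make the replica movable,
    # even though it can only land back in an already-occupied zone.
    old = wanted_parts_total - wanted_parts_for_tier[tier] - moved_parts
    print(old)  # 5 -> gathered: the "dancing replica"

    # New formula: only siblings under their replica allowance count.
    new = sum(wanted_parts_for_tier[t] for t in siblings
              if max_allowed_replicas[t] > replicas_at_tier.get(t, 0)
              ) - moved_parts
    print(new)  # 0 -> left alone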
@@ -741,8 +778,6 @@ class RingBuilder(object):
                         moved_parts += 1
                         break
                 if removed_replica:
-                    if dev['id'] not in tfd:
-                        tfd[dev['id']] = tiers_for_dev(dev)
                     for tier in tfd[dev['id']]:
                         replicas_at_tier[tier] -= 1

test/unit/common/ring/test_builder.py

@@ -19,6 +19,7 @@ import operator
 import os
 import unittest
 import cPickle as pickle
+from array import array
 from collections import defaultdict
 from math import ceil
 from tempfile import mkdtemp
@@ -656,6 +657,58 @@ class TestRingBuilder(unittest.TestCase):
         rb.rebalance()
 
+    def test_remove_last_partition_from_zero_weight(self):
+        rb = ring.RingBuilder(4, 3, 1)
+
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 1, 'weight': 1.0,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 2, 'weight': 2.0,
+                    'ip': '127.0.0.2', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 2, 'region': 0, 'zone': 3, 'weight': 3.0,
+                    'ip': '127.0.0.3', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 0.5,
+                    'ip': '127.0.0.3', 'port': 10001, 'device': 'zero'})
+
+        zero_weight_dev = 3
+
+        rb.rebalance()
+
+        # We want at least one partition with replicas only in zone 2 and 3
+        # due to device weights. It would *like* to spread out into zone 1,
+        # but can't, due to device weight.
+        #
+        # Also, we want such a partition to have a replica on device 3,
+        # which we will then reduce to zero weight. This should cause the
+        # removal of the replica from device 3.
+        #
+        # Getting this to happen by chance is hard, so let's just set up a
+        # builder so that it's in the state we want. This is a synthetic
+        # example; while the bug has happened on a real cluster, that
+        # builder file had a part_power of 16, so its contents are much too
+        # big to include here.
+        rb._replica2part2dev = [
+            #                                   these are the relevant ones
+            #                                   |  |  |  |
+            #                                   v  v  v  v
+            array('H', [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
+            array('H', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]),
+            array('H', [0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 2, 2, 2, 2])]
+
+        rb.set_dev_weight(zero_weight_dev, 0.0)
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=1)
+
+        node_counts = defaultdict(int)
+        for part2dev_id in rb._replica2part2dev:
+            for dev_id in part2dev_id:
+                node_counts[dev_id] += 1
+        self.assertEqual(node_counts[zero_weight_dev], 0)
+
+        # it's as balanced as it gets, so nothing moves anymore
+        rb.pretend_min_part_hours_passed()
+        parts_moved, _balance = rb.rebalance(seed=1)
+        self.assertEqual(parts_moved, 0)
+
     def test_region_fullness_with_balanceable_ring(self):
         rb = ring.RingBuilder(8, 3, 1)
         rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1,