Optimize the ring builder's _reassign_parts() method.
Another ring builder optimization. Profiling revealed hotspots in many
calls to min() and list.sort() in _reassign_parts(). That method didn't
get exercised in my last optimization pass because that pass targeted a
rebalance where nothing really moved around.

This time, I wrote a script which created a fresh ring, added a bunch of
devices, did the initial balance, deleted some devices, balanced, and
added some more back in.

Results from homebrew Python 2.7.3 on OS X 10.8.2 Macbook Pro (bare-metal):

BEFORE:
Using part-power = 18, adding 600 devices, removing 100, then adding 300 more...
NOT Profiling to 'initial_balance.prof'        wall-time delta: 131.33s
NOT Profiling to 'deleting_200_rebalance.prof' wall-time delta: 25.67s
NOT Profiling to 'first_rebalance.prof'        wall-time delta: 62.00s

AFTER:
Using part-power = 18, adding 600 devices, removing 100, then adding 300 more...
NOT Profiling to 'initial_balance.prof'        wall-time delta: 28.04s
NOT Profiling to 'deleting_200_rebalance.prof' wall-time delta: 9.35s
NOT Profiling to 'first_rebalance.prof'        wall-time delta: 16.41s

The driver script I used is available here:
https://gist.github.com/adb982aec6f0709f1273

Change-Id: I17e270acb12b5e4d4bbb1e34d8867dea90678961
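The linked gist is not reproduced here, but a driver in the spirit described above looks roughly like the following sketch. The device counts follow the description; the device-dict fields, zone layout, IP scheme, and timing labels are illustrative assumptions, not copied from the actual script.

    # Illustrative benchmark driver sketch (not the linked gist).
    # Builds a ring, balances, removes devices, rebalances, adds more,
    # and prints a wall-time delta for each step.
    import time
    from swift.common.ring.builder import RingBuilder

    def timed(label, fn):
        start = time.time()
        fn()
        print '%s wall-time delta: %.2fs' % (label, time.time() - start)

    builder = RingBuilder(18, 3, 1)   # part-power 18, 3 replicas, 1 min_part_hour
    for i in xrange(600):
        builder.add_dev({'id': i, 'zone': i % 16, 'weight': 100.0,
                         'ip': '10.0.%d.%d' % (i // 256, i % 256),
                         'port': 6000, 'device': 'sda', 'meta': ''})
    timed('initial_balance', builder.rebalance)

    for i in xrange(100):
        builder.remove_dev(i)
    builder.pretend_min_part_hours_passed()
    timed('deleting_rebalance', builder.rebalance)

    for i in xrange(600, 900):
        builder.add_dev({'id': i, 'zone': i % 16, 'weight': 100.0,
                         'ip': '10.1.%d.%d' % (i // 256, i % 256),
                         'port': 6000, 'device': 'sda', 'meta': ''})
    builder.pretend_min_part_hours_passed()
    timed('first_rebalance', builder.rebalance)

Timing each rebalance separately is what produces the per-phase BEFORE/AFTER figures summarized above.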
parent 62e71a2b1f
commit ec084de189
@@ -641,65 +641,97 @@ class RingBuilder(object):
             sorted((d for d in self._iter_devs() if d['weight']),
                    key=lambda x: x['sort_key'])
 
-        tier2children = build_tier_tree(available_devs)
-
         tier2devs = defaultdict(list)
         tier2sort_key = defaultdict(list)
-        tiers_by_depth = defaultdict(set)
+        max_tier_depth = 0
         for dev in available_devs:
             for tier in tiers_for_dev(dev):
                 tier2devs[tier].append(dev)  # <-- starts out sorted!
                 tier2sort_key[tier].append(dev['sort_key'])
-                tiers_by_depth[len(tier)].add(tier)
+                if len(tier) > max_tier_depth:
+                    max_tier_depth = len(tier)
+
+        tier2children_sets = build_tier_tree(available_devs)
+        tier2children = defaultdict(list)
+        tier2children_sort_key = {}
+        tiers_list = [()]
+        depth = 1
+        while depth <= max_tier_depth:
+            new_tiers_list = []
+            for tier in tiers_list:
+                child_tiers = list(tier2children_sets[tier])
+                child_tiers.sort(key=lambda t: tier2sort_key[t][-1])
+                tier2children[tier] = child_tiers
+                tier2children_sort_key[tier] = map(
+                    lambda t: tier2sort_key[t][-1], child_tiers)
+                new_tiers_list.extend(child_tiers)
+            tiers_list = new_tiers_list
+            depth += 1
 
         for part, replace_replicas in reassign_parts:
             # Gather up what other tiers (zones, ip_ports, and devices) the
             # replicas not-to-be-moved are in for this part.
-            other_replicas = defaultdict(lambda: 0)
+            other_replicas = defaultdict(int)
+            unique_tiers_by_tier_len = defaultdict(set)
             for replica in xrange(self.replicas):
                 if replica not in replace_replicas:
                     dev = self.devs[self._replica2part2dev[replica][part]]
                     for tier in tiers_for_dev(dev):
                         other_replicas[tier] += 1
-
-            def find_home_for_replica(tier=(), depth=1):
-                # Order the tiers by how many replicas of this
-                # partition they already have. Then, of the ones
-                # with the smallest number of replicas, pick the
-                # tier with the hungriest drive and then continue
-                # searching in that subtree.
-                #
-                # There are other strategies we could use here,
-                # such as hungriest-tier (i.e. biggest
-                # sum-of-parts-wanted) or picking one at random.
-                # However, hungriest-drive is what was used here
-                # before, and it worked pretty well in practice.
-                #
-                # Note that this allocator will balance things as
-                # evenly as possible at each level of the device
-                # layout. If your layout is extremely unbalanced,
-                # this may produce poor results.
-                candidate_tiers = tier2children[tier]
-                min_count = min(other_replicas[t] for t in candidate_tiers)
-                candidate_tiers = [t for t in candidate_tiers
-                                   if other_replicas[t] == min_count]
-                candidate_tiers.sort(
-                    key=lambda t: tier2sort_key[t][-1])
-
-                if depth == max(tiers_by_depth.keys()):
-                    return tier2devs[candidate_tiers[-1]][-1]
-
-                return find_home_for_replica(tier=candidate_tiers[-1],
-                                             depth=depth + 1)
+                        unique_tiers_by_tier_len[len(tier)].add(tier)
 
             for replica in replace_replicas:
-                dev = find_home_for_replica()
+                tier = ()
+                depth = 1
+                while depth <= max_tier_depth:
+                    # Order the tiers by how many replicas of this
+                    # partition they already have. Then, of the ones
+                    # with the smallest number of replicas, pick the
+                    # tier with the hungriest drive and then continue
+                    # searching in that subtree.
+                    #
+                    # There are other strategies we could use here,
+                    # such as hungriest-tier (i.e. biggest
+                    # sum-of-parts-wanted) or picking one at random.
+                    # However, hungriest-drive is what was used here
+                    # before, and it worked pretty well in practice.
+                    #
+                    # Note that this allocator will balance things as
+                    # evenly as possible at each level of the device
+                    # layout. If your layout is extremely unbalanced,
+                    # this may produce poor results.
+                    #
+                    # This used to be a cute, recursive function, but it's been
+                    # unrolled for performance.
+                    candidate_tiers = tier2children[tier]
+                    candidates_with_replicas = \
+                        unique_tiers_by_tier_len[len(tier) + 1]
+                    if len(candidate_tiers) > len(candidates_with_replicas):
+                        # There exists at least one tier with 0 other replicas,
+                        # so work backward among the candidates, accepting the
+                        # first which isn't in other_replicas.
+                        #
+                        # This optimization is to avoid calling the min()
+                        # below, which is expensive if you've got thousands of
+                        # drives.
+                        for t in reversed(candidate_tiers):
+                            if other_replicas[t] == 0:
+                                tier = t
+                                break
+                    else:
+                        min_count = min(other_replicas[t]
+                                        for t in candidate_tiers)
+                        tier = (t for t in reversed(candidate_tiers)
+                                if other_replicas[t] == min_count).next()
+                    depth += 1
+                dev = tier2devs[tier][-1]
                 dev['parts_wanted'] -= 1
                 dev['parts'] += 1
                 old_sort_key = dev['sort_key']
                 new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                 for tier in tiers_for_dev(dev):
                     other_replicas[tier] += 1
+                    unique_tiers_by_tier_len[len(tier)].add(tier)
 
                     index = bisect.bisect_left(tier2sort_key[tier],
                                                old_sort_key)
@@ -711,6 +743,22 @@ class RingBuilder(object):
                     tier2devs[tier].insert(new_index, dev)
                     tier2sort_key[tier].insert(new_index, new_sort_key)
 
+                    # Now jiggle tier2children values to keep them sorted
+                    new_last_sort_key = tier2sort_key[tier][-1]
+                    parent_tier = tier[0:-1]
+                    index = bisect.bisect_left(
+                        tier2children_sort_key[parent_tier],
+                        old_sort_key)
+                    popped = tier2children[parent_tier].pop(index)
+                    tier2children_sort_key[parent_tier].pop(index)
+
+                    new_index = bisect.bisect_left(
+                        tier2children_sort_key[parent_tier],
+                        new_last_sort_key)
+                    tier2children[parent_tier].insert(new_index, popped)
+                    tier2children_sort_key[parent_tier].insert(
+                        new_index, new_last_sort_key)
+
                 self._replica2part2dev[replica][part] = dev['id']
 
             # Just to save memory and keep from accidental reuse.
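The heart of the change is the bookkeeping pattern visible in both hunks: the per-tier device and sort-key lists are kept sorted at all times, so the hungriest device in a tier is always the last element, and when a device's sort key changes it is relocated with bisect instead of re-sorting the list or re-scanning it with min(). A minimal, self-contained sketch of that pattern, with toy names rather than Swift code:

    # Toy illustration of the pattern: parallel lists kept sorted by key,
    # updated with bisect when one item's key changes.
    import bisect

    items = ['d1', 'd2', 'd3', 'd4']   # kept sorted by the keys below
    keys = [3, 7, 9, 12]               # hungriest candidate is last

    def pick_hungriest():
        # O(1): the last element always has the largest key.
        return items[-1]

    def update_key(item, old_key, new_key):
        # O(log n) search plus list insertion; no full sort() or min() scan.
        index = bisect.bisect_left(keys, old_key)
        items.pop(index)
        keys.pop(index)
        new_index = bisect.bisect_left(keys, new_key)
        items.insert(new_index, item)
        keys.insert(new_index, new_key)

    chosen = pick_hungriest()   # 'd4'
    update_key(chosen, 12, 2)   # d4 just received a partition; less hungry now
    print pick_hungriest()      # 'd3'

The zero-replica fast path in the unrolled while loop serves the same goal from another angle: when at least one candidate tier holds no replicas of the partition yet, scanning backward for the first such tier avoids calling min() over every candidate.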