From a8bd2f737cd053db5cd9fb0926344acc1f68e2ce Mon Sep 17 00:00:00 2001
From: Clay Gerrard
Date: Tue, 30 Dec 2014 00:22:44 -0800
Subject: [PATCH] Add dispersion command to swift-ring-builder

Output a dispersion report that shows how many parts have each replica
count at each tier, along with some additional context. The
max_dispersion is also a good canary for what a reasonable overload
might be.

Also display a warning on rebalance if the ring's dispersion is
sub-optimal.

The primitive form of the dispersion graph is cached on the builder,
but the dispersion command will build it on the fly if you have a ring
that was last rebalanced before the change.

Also add a --force option to rebalance to make it write a ring even if
less than 1% of parts moved.

Try to clarify dispersion and balance a bit in the ring section of the
architectural overview.

Co-Authored-By: Christian Schwede
Co-Authored-By: Darrell Bishop
Change-Id: I7696df25d092fac56588080722e0a4167ed2c824
---
 doc/source/overview_architecture.rst |  40 ++++++---
 swift/cli/ringbuilder.py             | 124 ++++++++++++++++++++++++---
 swift/common/ring/builder.py         |  94 +++++++++++++++++---
 swift/common/ring/utils.py           |  51 +++++++++++
 test/unit/cli/test_ringbuilder.py    |  17 +++-
 test/unit/common/ring/test_utils.py  |  64 +++++++++++++-
 6 files changed, 355 insertions(+), 35 deletions(-)

diff --git a/doc/source/overview_architecture.rst b/doc/source/overview_architecture.rst
index ec884e1d4a..b8c9a32f75 100644
--- a/doc/source/overview_architecture.rst
+++ b/doc/source/overview_architecture.rst
@@ -37,19 +37,37 @@ cluster, and the locations for a partition are stored in the mapping
 maintained by the ring. The ring is also responsible for determining which
 devices are used for handoff in failure scenarios.
 
-Data can be isolated with the concept of zones in the ring. Each replica
-of a partition is guaranteed to reside in a different zone. A zone could
-represent a drive, a server, a cabinet, a switch, or even a datacenter.
+The replicas of each partition will be isolated onto as many distinct regions,
+zones, servers and devices as the capacity of these failure domains allows. If
+there are fewer failure domains at a given tier than replicas assigned within
+that tier (e.g. a 3 replica cluster with 2 servers), or the available capacity
+across the failure domains within a tier is not well balanced, it will not be
+possible to achieve both even capacity distribution (`balance`) and complete
+isolation of replicas across failure domains (`dispersion`). When this occurs
+the ring management tools will display a warning so that the operator can
+evaluate the cluster topology.
 
-The partitions of the ring are equally divided among all the devices in the
-Swift installation. When partitions need to be moved around (for example if a
-device is added to the cluster), the ring ensures that a minimum number of
-partitions are moved at a time, and only one replica of a partition is moved at
-a time.
+Data is evenly distributed across the capacity available in the cluster as
+described by the devices' weights. Weights can be used to balance the
+distribution of partitions on drives across the cluster. This can be useful,
+for example, when differently sized drives are used in a cluster. Device
+weights can also be used when adding or removing capacity or failure domains,
+to control how many partitions are reassigned during a rebalance and moved as
+soon as replication bandwidth allows.
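+
+As an illustration (a made-up two zone layout, not output from any tool),
+consider a 3 replica ring where zone ``z1`` holds 400 of the 500 total
+weight. Neither of the two zones should hold more than ``ceil(3 / 2) = 2``
+replicas of any one partition, but ``z1``'s share of the capacity demands
+more::
+
+    400.0 / 500 * 3 = 2.4 replicas per partition, on average, in z1
+
+To average 2.4 replicas with at most 3 per partition, roughly 40% of all
+partitions must keep all three replicas in ``z1``; those partitions cannot be
+fully dispersed, and the ring tools will warn about them.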
-Weights can be used to balance the distribution of partitions on drives
-across the cluster. This can be useful, for example, when different sized
-drives are used in a cluster.
+.. note::
+    Prior to Swift 2.1.0 it was not possible to restrict partition movement
+    by device weight when adding new failure domains, which could allow
+    extremely unbalanced rings. The greedy dispersion algorithm is now
+    subject to the constraints of the physical capacity in the system, but
+    can be adjusted within reason via the overload option. Artificially
+    unbalancing the partition assignment without respect to capacity can
+    introduce unexpectedly full devices when a given failure domain does not
+    physically support its share of the used capacity in the tier.
+
+When partitions need to be moved around (for example if a device is added to
+the cluster), the ring ensures that a minimum number of partitions are moved
+at a time, and only one replica of a partition is moved at a time.
 
 The ring is used by the Proxy server and several background processes
 (like replication).
diff --git a/swift/cli/ringbuilder.py b/swift/cli/ringbuilder.py
index 83d466f64f..2d2177b9f7 100755
--- a/swift/cli/ringbuilder.py
+++ b/swift/cli/ringbuilder.py
@@ -21,13 +21,16 @@ from os.path import basename, abspath, dirname, exists, join as pathjoin
 from sys import argv as sys_argv, exit, stderr
 from textwrap import wrap
 from time import time
+import optparse
+import math
 
 from swift.common import exceptions
 from swift.common.ring import RingBuilder, Ring
 from swift.common.ring.builder import MAX_BALANCE
 from swift.common.utils import lock_parent_directory
 from swift.common.ring.utils import parse_search_value, parse_args, \
-    build_dev_from_opts, parse_builder_ring_filename_args, find_parts
+    build_dev_from_opts, parse_builder_ring_filename_args, find_parts, \
+    dispersion_report
 
 MAJOR_VERSION = 1
 MINOR_VERSION = 3
@@ -246,9 +249,9 @@ swift-ring-builder
             if dev is not None])
     balance = builder.get_balance()
     print '%d partitions, %.6f replicas, %d regions, %d zones, ' \
-          '%d devices, %.02f balance' % (builder.parts, builder.replicas,
-                                         regions, zones, dev_count,
-                                         balance)
+          '%d devices, %.02f balance, %.02f dispersion' % (
+              builder.parts, builder.replicas, regions, zones, dev_count,
+              balance, builder.dispersion)
     print 'The minimum number of hours before a partition can be ' \
           'reassigned is %s' % builder.min_part_hours
     print 'The overload factor is %.6f' % builder.overload
@@ -600,13 +603,23 @@ swift-ring-builder remove [search-value ...]
 
     def rebalance():
         """
-swift-ring-builder rebalance
+swift-ring-builder rebalance [options]
 
    Attempts to rebalance the ring by reassigning partitions that haven't been
    recently reassigned.
        """
+        usage = Commands.rebalance.__doc__.strip()
+        parser = optparse.OptionParser(usage)
+        parser.add_option('-f', '--force', action='store_true',
+                          help='Force a rebalanced ring to save even '
+                               'if < 1% of parts changed')
+        parser.add_option('-s', '--seed', help="seed to use for rebalance")
+        options, args = parser.parse_args(argv)
+
        def get_seed(index):
+            if options.seed:
+                return options.seed
            try:
-                return argv[index]
+                return args[index]
            except IndexError:
                pass
@@ -632,7 +645,8 @@ swift-ring-builder rebalance
        # special value(MAX_BALANCE) until zero weighted device return all
        # its partitions. So we cannot check balance has changed.
        # Thus we need to check balance or last_balance is special value.
-        if not devs_changed and abs(last_balance - balance) < 1 and \
+        if not options.force and \
+                not devs_changed and abs(last_balance - balance) < 1 and \
                not (last_balance == MAX_BALANCE and balance == MAX_BALANCE):
            print 'Cowardly refusing to save rebalance as it did not change ' \
                  'at least 1%.'
@@ -648,10 +662,23 @@ swift-ring-builder rebalance
            )
            print '-' * 79
            exit(EXIT_ERROR)
-        print 'Reassigned %d (%.02f%%) partitions. Balance is now %.02f.' % \
-              (parts, 100.0 * parts / builder.parts, balance)
+        print ('Reassigned %d (%.02f%%) partitions. '
+               'Balance is now %.02f. '
+               'Dispersion is now %.02f' % (
+                   parts, 100.0 * parts / builder.parts,
+                   balance,
+                   builder.dispersion))
        status = EXIT_SUCCESS
-        if balance > 5 and balance / 100.0 > builder.overload:
+        if builder.dispersion > 0:
+            print '-' * 79
+            print('NOTE: Dispersion of %.06f indicates some parts are not\n'
+                  '      optimally dispersed.\n\n'
+                  '      You may want to adjust some device weights, increase\n'
+                  '      the overload, or review the dispersion report.' %
+                  builder.dispersion)
+            status = EXIT_WARNING
+            print '-' * 79
+        elif balance > 5 and balance / 100.0 > builder.overload:
            print '-' * 79
            print 'NOTE: Balance of %.02f indicates you should push this ' % \
                balance
@@ -667,6 +694,83 @@ swift-ring-builder rebalance
        builder.save(argv[1])
        exit(status)
 
+    def dispersion():
+        """
+swift-ring-builder dispersion [options]
+
+    Output a report on dispersion.
+
+    The --verbose option will display the dispersion graph broken down by
+    tier.
+
+    To drill down, you can filter which tiers are evaluated using a regex in
+    the optional search_filter argument.
+
+    The report's columns are:
+
+    Tier  : the name of the tier
+    parts : the total number of partitions with assignment in the tier
+    %     : the percentage of parts in the tier with more than max
+            replicas assigned
+    max   : maximum replicas a part should have assigned at the tier
+    0 - N : the number of parts with that many replicas assigned
+
+    e.g.
+        Tier:  parts      %   max   0    1    2   3
+        r1z1    1022  79.45     1   2  210  784  28
+
+    r1z1 has 1022 total parts assigned, 79% of them have more than the
+    recommended max replica count of 1 assigned. Only 2 parts in the ring
+    are *not* assigned in this tier (0 replica count), 210 parts have the
+    recommended replica count of 1, 784 have 2 replicas, and 28 sadly have
+    all three replicas in this tier.
+        """
+        status = EXIT_SUCCESS
+        if not builder._replica2part2dev:
+            print('Specified builder file \"%s\" is not rebalanced yet. '
+                  'Please rebalance first.'
% argv[1]) + exit(EXIT_ERROR) + usage = Commands.dispersion.__doc__.strip() + parser = optparse.OptionParser(usage) + parser.add_option('-v', '--verbose', action='store_true', + help='Display dispersion report for tiers') + options, args = parser.parse_args(argv) + if args[3:]: + search_filter = args[3] + else: + search_filter = None + report = dispersion_report(builder, search_filter=search_filter, + verbose=options.verbose) + print 'Dispersion is %.06f' % builder.dispersion + if report['worst_tier']: + status = EXIT_WARNING + print 'Worst tier is %.06f (%s)' % (report['max_dispersion'], + report['worst_tier']) + if report['graph']: + replica_range = range(int(math.ceil(builder.replicas + 1))) + part_count_width = '%%%ds' % max(len(str(builder.parts)), 5) + replica_counts_tmpl = ' '.join(part_count_width for i in + replica_range) + tiers = (tier for tier, _junk in report['graph']) + tier_width = max(max(map(len, tiers)), 30) + header_line = ('%-' + str(tier_width) + + 's ' + part_count_width + + ' %6s %6s ' + replica_counts_tmpl) % tuple( + ['Tier', 'Parts', '%', 'Max'] + replica_range) + underline = '-' * len(header_line) + print(underline) + print(header_line) + print(underline) + for tier_name, dispersion in report['graph']: + replica_counts_repr = replica_counts_tmpl % tuple( + dispersion['replicas']) + print ('%-' + str(tier_width) + 's ' + part_count_width + + ' %6.02f %6d %s') % (tier_name, + dispersion['placed_parts'], + dispersion['dispersion'], + dispersion['max_replicas'], + replica_counts_repr, + ) + exit(status) + def validate(): """ swift-ring-builder validate diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py index 063a3e0e4d..f2c04eba3c 100644 --- a/swift/common/ring/builder.py +++ b/swift/common/ring/builder.py @@ -89,6 +89,9 @@ class RingBuilder(object): self._last_part_moves = None self._last_part_gather_start = 0 + + self._dispersion_graph = {} + self.dispersion = 0.0 self._remove_devs = [] self._ring = None @@ -143,6 +146,8 @@ class RingBuilder(object): self._last_part_moves_epoch = builder['_last_part_moves_epoch'] self._last_part_moves = builder['_last_part_moves'] self._last_part_gather_start = builder['_last_part_gather_start'] + self._dispersion_graph = builder.get('_dispersion_graph', {}) + self.dispersion = builder.get('dispersion', 0.0) self._remove_devs = builder['_remove_devs'] self._ring = None @@ -170,6 +175,8 @@ class RingBuilder(object): '_last_part_moves_epoch': self._last_part_moves_epoch, '_last_part_moves': self._last_part_moves, '_last_part_gather_start': self._last_part_gather_start, + '_dispersion_graph': self._dispersion_graph, + 'dispersion': self.dispersion, '_remove_devs': self._remove_devs} def change_min_part_hours(self, min_part_hours): @@ -348,6 +355,7 @@ class RingBuilder(object): if self._last_part_moves_epoch is None: self._initial_balance() self.devs_changed = False + self._build_dispersion_graph() return self.parts, self.get_balance() changed_parts = 0 self._update_last_part_moves() @@ -371,12 +379,62 @@ class RingBuilder(object): self.devs_changed = False self.version += 1 + changed_parts = self._build_dispersion_graph(old_replica2part2dev) + return changed_parts, balance + + def _build_dispersion_graph(self, old_replica2part2dev=None): + """ + Build a dict of all tiers in the cluster to a list of the number of + parts with a replica count at each index. 
The values of the dict will
+        be lists of length the maximum whole replica count + 1, so that
+        graph[tier][3] is the number of parts within the tier with 3 replicas
+        and graph[tier][0] is the number of parts not assigned in this tier.
+
+        i.e.
+        {
+            <tier>: [
+                <number_of_parts_with_0_replicas_in_tier>,
+                <number_of_parts_with_1_replica_in_tier>,
+                ...
+                <number_of_parts_with_n_replicas_in_tier>,
+            ],
+            ...
+        }
+
+        :param old_replica2part2dev: if called from rebalance, the
+            old_replica2part2dev can be used to count moved parts.
+
+        :returns: number of parts with different assignments than
+            old_replica2part2dev if provided
+        """
+
+        # Since we're going to loop over every replica of every part we'll
+        # also count up changed_parts if old_replica2part2dev is passed in
+        old_replica2part2dev = old_replica2part2dev or []
 
        # Compare the partition allocation before and after the rebalance
        # Only changed device ids are taken into account; devices might be
        # "touched" during the rebalance, but actually not really moved
        changed_parts = 0
-        for rep_id, _rep in enumerate(self._replica2part2dev):
-            for part_id, new_device in enumerate(_rep):
+
+        int_replicas = int(math.ceil(self.replicas))
+        max_allowed_replicas = self._build_max_replicas_by_tier()
+        parts_at_risk = 0
+
+        tfd = {}
+
+        dispersion_graph = {}
+        # go over all the devices holding each replica part by part
+        for part_id, dev_ids in enumerate(
+                itertools.izip(*self._replica2part2dev)):
+            # count the number of replicas of this part for each tier of each
+            # device, some devices may have overlapping tiers!
+            replicas_at_tier = defaultdict(int)
+            for rep_id, dev in enumerate(iter(
+                    self.devs[dev_id] for dev_id in dev_ids)):
+                if dev['id'] not in tfd:
+                    tfd[dev['id']] = tiers_for_dev(dev)
+                for tier in tfd[dev['id']]:
+                    replicas_at_tier[tier] += 1
                # IndexErrors will be raised if the replicas are increased or
                # decreased, and that actually means the partition has changed
                try:
@@ -385,9 +443,25 @@
                    changed_parts += 1
                    continue
 
-                if old_device != new_device:
+                if old_device != dev['id']:
                    changed_parts += 1
-        return changed_parts, balance
+            part_at_risk = False
+            # update running totals for each tiers' number of parts with a
+            # given replica count
+            for tier, replicas in replicas_at_tier.items():
+                if tier not in dispersion_graph:
+                    dispersion_graph[tier] = [self.parts] + [0] * int_replicas
+                dispersion_graph[tier][0] -= 1
+                dispersion_graph[tier][replicas] += 1
+                if replicas > max_allowed_replicas[tier]:
+                    part_at_risk = True
+            # this part may be at risk in multiple tiers, but we only count it
+            # as at_risk once
+            if part_at_risk:
+                parts_at_risk += 1
+        self._dispersion_graph = dispersion_graph
+        self.dispersion = 100.0 * parts_at_risk / self.parts
+        return changed_parts
 
    def validate(self, stats=False):
        """
@@ -978,11 +1052,11 @@
            if candidates_with_room:
                if len(candidates_with_room) > \
                        len(candidates_with_replicas):
-                    # There exists at least one tier with room for
-                    # another partition and 0 other replicas already in
-                    # it, so we can use a faster search. The else
-                    # branch's search would work here, but it's
-                    # significantly slower.
+                    # There exists at least one tier with room for
+                    # another partition and 0 other replicas already
+                    # in it, so we can use a faster search. The else
+                    # branch's search would work here, but it's
+                    # significantly slower.
roomiest_tier = max( (t for t in candidates_with_room if other_replicas[t] == 0), @@ -1183,7 +1257,7 @@ class RingBuilder(object): builder = RingBuilder(1, 1, 1) builder.copy_from(builder_dict) for dev in builder.devs: - #really old rings didn't have meta keys + # really old rings didn't have meta keys if dev and 'meta' not in dev: dev['meta'] = '' # NOTE(akscram): An old ring builder file don't contain diff --git a/swift/common/ring/utils.py b/swift/common/ring/utils.py index f93326cc17..2ca2134605 100644 --- a/swift/common/ring/utils.py +++ b/swift/common/ring/utils.py @@ -15,6 +15,7 @@ from collections import defaultdict from operator import itemgetter import optparse +import re def tiers_for_dev(dev): @@ -331,3 +332,53 @@ def find_parts(builder, argv): partition_count.iteritems(), key=itemgetter(1), reverse=True) return sorted_partition_count + + +def dispersion_report(builder, search_filter=None, verbose=False): + if not builder._dispersion_graph: + builder._build_dispersion_graph() + max_allowed_replicas = builder._build_max_replicas_by_tier() + worst_tier = None + max_dispersion = 0.0 + sorted_graph = [] + for tier, replica_counts in sorted(builder._dispersion_graph.items()): + tier_name = get_tier_name(tier, builder) + if search_filter and not re.match(search_filter, tier_name): + continue + max_replicas = int(max_allowed_replicas[tier]) + at_risk_parts = sum(replica_counts[max_replicas + 1:]) + placed_parts = sum(replica_counts[1:]) + tier_dispersion = 100.0 * at_risk_parts / placed_parts + if tier_dispersion > max_dispersion: + max_dispersion = tier_dispersion + worst_tier = tier_name + max_dispersion = max(max_dispersion, tier_dispersion) + if not verbose: + continue + + tier_report = { + 'max_replicas': max_replicas, + 'placed_parts': placed_parts, + 'dispersion': tier_dispersion, + 'replicas': replica_counts, + } + sorted_graph.append((tier_name, tier_report)) + + return { + 'max_dispersion': max_dispersion, + 'worst_tier': worst_tier, + 'graph': sorted_graph, + } + + +def get_tier_name(tier, builder): + if len(tier) == 1: + return "r%s" % (tier[0], ) + if len(tier) == 2: + return "r%sz%s" % (tier[0], tier[1]) + if len(tier) == 3: + return "r%sz%s-%s" % (tier[0], tier[1], tier[2]) + if len(tier) == 4: + device = builder.devs[tier[3]] or {} + return "r%sz%s-%s/%s" % (tier[0], tier[1], tier[2], + device.get('device', 'IDd%s' % tier[3])) diff --git a/test/unit/cli/test_ringbuilder.py b/test/unit/cli/test_ringbuilder.py index 6c449a8204..2fb231f9d6 100644 --- a/test/unit/cli/test_ringbuilder.py +++ b/test/unit/cli/test_ringbuilder.py @@ -286,15 +286,26 @@ class TestCommands(unittest.TestCase): self.assertEquals(e.code, 2) def test_validate_generic_error(self): - with mock.patch.object(RingBuilder, 'load', - mock.Mock(side_effect= - IOError('Generic error occurred'))): + with mock.patch.object( + RingBuilder, 'load', mock.Mock( + side_effect=IOError('Generic error occurred'))): argv = ["", self.tmpfile, "validate"] try: swift.cli.ringbuilder.main(argv) except SystemExit as e: self.assertEquals(e.code, 2) + def test_warn_at_risk(self): + self.create_sample_ring() + ring = RingBuilder.load(self.tmpfile) + ring.devs[0]['weight'] = 10 + ring.save(self.tmpfile) + argv = ["", self.tmpfile, "rebalance"] + try: + swift.cli.ringbuilder.main(argv) + except SystemExit as e: + self.assertEquals(e.code, 1) + class TestRebalanceCommand(unittest.TestCase): diff --git a/test/unit/common/ring/test_utils.py b/test/unit/common/ring/test_utils.py index 2a9cbfcf59..0a7cae2828 100644 --- 
a/test/unit/common/ring/test_utils.py
+++ b/test/unit/common/ring/test_utils.py
@@ -19,7 +19,8 @@ from swift.common import ring
 from swift.common.ring.utils import (build_tier_tree, tiers_for_dev,
                                      parse_search_value, parse_args,
                                      build_dev_from_opts, find_parts,
-                                     parse_builder_ring_filename_args)
+                                     parse_builder_ring_filename_args,
+                                     dispersion_report)
 
 
 class TestUtils(unittest.TestCase):
@@ -188,6 +189,67 @@ class TestUtils(unittest.TestCase):
                3, count,
                "Partition %d has only %d replicas" %
                (partition, count))
 
+    def test_dispersion_report(self):
+        rb = ring.RingBuilder(8, 3, 0)
+        rb.add_dev({'id': 0, 'region': 1, 'zone': 0, 'weight': 100,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'})
+        rb.add_dev({'id': 1, 'region': 1, 'zone': 1, 'weight': 200,
+                    'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
+        rb.add_dev({'id': 2, 'region': 1, 'zone': 1, 'weight': 200,
+                    'ip': '127.0.0.1', 'port': 10002, 'device': 'sda1'})
+        rb.rebalance(seed=10)
+
+        self.assertEqual(rb.dispersion, 39.84375)
+        report = dispersion_report(rb)
+        self.assertEqual(report['worst_tier'], 'r1z1')
+        self.assertEqual(report['max_dispersion'], 39.84375)
+
+        # Each node can hold at most 256 distinct partitions; since zone 1
+        # holds 2/5 of the total weight, each of its nodes is asked to take
+        # 2/5 * 768 replicas ~= 307, so ~51 partitions on each node in
+        # zone 1 are stored at least twice within the zone
+        def build_tier_report(max_replicas, placed_parts, dispersion,
+                              replicas):
+            return {
+                'max_replicas': max_replicas,
+                'placed_parts': placed_parts,
+                'dispersion': dispersion,
+                'replicas': replicas,
+            }
+        expected = [
+            ['r1z1', build_tier_report(
+                2, 256, 39.84375, [0, 0, 154, 102])],
+            ['r1z1-127.0.0.1:10001', build_tier_report(
+                1, 256, 19.921875, [0, 205, 51, 0])],
+            ['r1z1-127.0.0.1:10001/sda1', build_tier_report(
+                1, 256, 19.921875, [0, 205, 51, 0])],
+            ['r1z1-127.0.0.1:10002', build_tier_report(
+                1, 256, 19.921875, [0, 205, 51, 0])],
+            ['r1z1-127.0.0.1:10002/sda1', build_tier_report(
+                1, 256, 19.921875, [0, 205, 51, 0])],
+        ]
+        report = dispersion_report(rb, 'r1z1.*', verbose=True)
+        graph = report['graph']
+        for i in range(len(expected)):
+            self.assertEqual(expected[i][0], graph[i][0])
+            self.assertEqual(expected[i][1], graph[i][1])
+
+        # overcompensate in r1z0
+        rb.add_dev({'id': 3, 'region': 1, 'zone': 0, 'weight': 500,
+                    'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'})
+        rb.rebalance(seed=10)
+
+        report = dispersion_report(rb)
+        self.assertEqual(rb.dispersion, 40.234375)
+        self.assertEqual(report['worst_tier'], 'r1z0-127.0.0.1:10003')
+        self.assertEqual(report['max_dispersion'], 30.078125)
+
 
 if __name__ == '__main__':
     unittest.main()
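
The per-tier numbers asserted in test_dispersion_report come down to one
small calculation, visible in dispersion_report() above: a part is "at
risk" in a tier when it has more replicas there than the tier's recommended
maximum, and a tier's dispersion is the at-risk percentage of its placed
parts. A minimal standalone sketch of that arithmetic (the function name
and the literal inputs are illustrative, not part of the patch):

    def tier_dispersion(replica_counts, max_replicas):
        """Percentage of parts placed in a tier that hold more replicas
        there than the tier's recommended maximum.

        replica_counts[i] is the number of parts with exactly i replicas
        in the tier; index 0 counts parts not assigned to the tier.
        """
        placed_parts = sum(replica_counts[1:])
        at_risk_parts = sum(replica_counts[max_replicas + 1:])
        return 100.0 * at_risk_parts / placed_parts

    # The r1z1 row from the test: 154 parts with 2 replicas (allowed)
    # and 102 parts with all 3 in a zone that should hold at most 2.
    print tier_dispersion([0, 0, 154, 102], 2)  # 39.84375

The builder-wide dispersion printed by rebalance is the same idea applied
per part instead of per tier: the percentage of parts over-assigned in at
least one tier.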