From af72881d1dcc46486d8d652b92a0ec9aa9ca2bfa Mon Sep 17 00:00:00 2001
From: Samuel Merritt
Date: Mon, 15 Jun 2015 13:36:36 -0700
Subject: [PATCH] Use just IP, not port, when determining partition placement

In the ring builder, we place partitions with maximum possible
dispersion across tiers, where a "tier" is region, then zone, then
IP/port, then device. Now, instead of IP/port, we use just the IP.

The port wasn't really getting us anything; two different object
servers on two different ports on one machine aren't separate failure
domains. However, if someone has only a few machines and runs one
object server per disk, each listening on its own port, then the ring
builder would end up with every disk in its own IP/port tier,
resulting in partition placement that is bad with respect to
durability.

For example: assume 1 region, 1 zone, 4 machines, 48 total disks (12
per machine), and one object server (and hence one port) per disk.
With the old behavior, partition replicas will all go in the one
region, then the one zone, then pick one of 48 IP/port pairs, then
pick the one disk therein. This gives the same result as randomly
picking 3 disks (without replacement) to store data on; it completely
ignores machine boundaries.

With the new behavior, the replica placer will pick the one region,
then the one zone, then one of 4 IPs, then one of 12 disks therein.
This gives the optimal placement with respect to durability.

The same applies to Ring.get_more_nodes().

Co-Authored-By: Kota Tsuyuzaki
Change-Id: Ibbd740c51296b7e360845b5309d276d7383a3742
---
 swift/common/ring/ring.py             |  34 ++++-----
 swift/common/ring/utils.py            | 104 +++++++++++++-------
 test/unit/common/ring/test_builder.py |  68 +++++++++++++----
 test/unit/common/ring/test_ring.py    |   3 +-
 test/unit/common/ring/test_utils.py   |  72 ++++++++----------
 5 files changed, 157 insertions(+), 124 deletions(-)

diff --git a/swift/common/ring/ring.py b/swift/common/ring/ring.py
index 62e19951d3..d4feaa8e23 100644
--- a/swift/common/ring/ring.py
+++ b/swift/common/ring/ring.py
@@ -179,18 +179,17 @@ class Ring(object):
         # doing it on every call to get_more_nodes().
         regions = set()
         zones = set()
-        ip_ports = set()
+        ips = set()
         self._num_devs = 0
         for dev in self._devs:
             if dev:
                 regions.add(dev['region'])
                 zones.add((dev['region'], dev['zone']))
-                ip_ports.add((dev['region'], dev['zone'],
-                              dev['ip'], dev['port']))
+                ips.add((dev['region'], dev['zone'], dev['ip']))
                 self._num_devs += 1
         self._num_regions = len(regions)
         self._num_zones = len(zones)
-        self._num_ip_ports = len(ip_ports)
+        self._num_ips = len(ips)
 
     def _rebuild_tier_data(self):
         self.tier2devs = defaultdict(list)
@@ -329,8 +328,8 @@ class Ring(object):
         used = set(d['id'] for d in primary_nodes)
         same_regions = set(d['region'] for d in primary_nodes)
         same_zones = set((d['region'], d['zone']) for d in primary_nodes)
-        same_ip_ports = set((d['region'], d['zone'], d['ip'], d['port'])
-                            for d in primary_nodes)
+        same_ips = set(
+            (d['region'], d['zone'], d['ip']) for d in primary_nodes)
 
         parts = len(self._replica2part2dev_id[0])
         start = struct.unpack_from(
@@ -356,9 +355,9 @@ class Ring(object):
                         used.add(dev_id)
                         same_regions.add(region)
                         zone = dev['zone']
-                        ip_port = (region, zone, dev['ip'], dev['port'])
+                        ip = (region, zone, dev['ip'])
                         same_zones.add((region, zone))
-                        same_ip_ports.add(ip_port)
+                        same_ips.add(ip)
                         if len(same_regions) == self._num_regions:
                             hit_all_regions = True
                             break
@@ -380,17 +379,17 @@
                         yield dev
                         used.add(dev_id)
                         same_zones.add(zone)
-                        ip_port = zone + (dev['ip'], dev['port'])
-                        same_ip_ports.add(ip_port)
+                        ip = zone + (dev['ip'],)
+                        same_ips.add(ip)
                         if len(same_zones) == self._num_zones:
                             hit_all_zones = True
                             break
 
-        hit_all_ip_ports = len(same_ip_ports) == self._num_ip_ports
+        hit_all_ips = len(same_ips) == self._num_ips
        for handoff_part in chain(xrange(start, parts, inc),
                                  xrange(inc - ((parts - start) % inc),
                                         start, inc)):
-            if hit_all_ip_ports:
+            if hit_all_ips:
                 # We've exhausted the pool of unused backends, so stop
                 # looking.
                break
@@ -398,14 +397,13 @@
             if handoff_part < len(part2dev_id):
                 dev_id = part2dev_id[handoff_part]
                 dev = self._devs[dev_id]
-                ip_port = (dev['region'], dev['zone'],
-                           dev['ip'], dev['port'])
-                if dev_id not in used and ip_port not in same_ip_ports:
+                ip = (dev['region'], dev['zone'], dev['ip'])
+                if dev_id not in used and ip not in same_ips:
                     yield dev
                     used.add(dev_id)
-                    same_ip_ports.add(ip_port)
-                    if len(same_ip_ports) == self._num_ip_ports:
-                        hit_all_ip_ports = True
+                    same_ips.add(ip)
+                    if len(same_ips) == self._num_ips:
+                        hit_all_ips = True
                         break
 
         hit_all_devs = len(used) == self._num_devs
diff --git a/swift/common/ring/utils.py b/swift/common/ring/utils.py
index b00ef825d4..4fcee2eb24 100644
--- a/swift/common/ring/utils.py
+++ b/swift/common/ring/utils.py
@@ -29,7 +29,7 @@ def tiers_for_dev(dev):
     """
     t1 = dev['region']
     t2 = dev['zone']
-    t3 = "{ip}:{port}".format(ip=dev.get('ip'), port=dev.get('port'))
+    t3 = dev['ip']
     t4 = dev['id']
 
     return ((t1,),
@@ -48,40 +48,40 @@ def build_tier_tree(devices):
 
     Example:
 
-    region 1 -+---- zone 1 -+---- 192.168.101.1:6000 -+---- device id 0
-              |             |                         |
-              |             |                         +---- device id 1
-              |             |                         |
-              |             |                         +---- device id 2
+    region 1 -+---- zone 1 -+---- 192.168.101.1 -+---- device id 0
+              |             |                    |
+              |             |                    +---- device id 1
+              |             |                    |
+              |             |                    +---- device id 2
               |             |
-              |             +---- 192.168.101.2:6000 -+---- device id 3
-              |                                       |
-              |                                       +---- device id 4
-              |                                       |
-              |                                       +---- device id 5
+              |             +---- 192.168.101.2 -+---- device id 3
+              |                                  |
+              |                                  +---- device id 4
+              |                                  |
+              |                                  +---- device id 5
               |
-              +---- zone 2 -+---- 192.168.102.1:6000 -+---- device id 6
-                            |                         |
-                            |                         +---- device id 7
-                            |                         |
-                            |                         +---- device id 8
+              +---- zone 2 -+---- 192.168.102.1 -+---- device id 6
+                            |                    |
+                            |                    +---- device id 7
+                            |                    |
+                            |                    +---- device id 8
                             |
-                            +---- 192.168.102.2:6000 -+---- device id 9
-                                                      |
-                                                      +---- device id 10
+                            +---- 192.168.102.2 -+---- device id 9
+                                                 |
+                                                 +---- device id 10
 
-    region 2 -+---- zone 1 -+---- 192.168.201.1:6000 -+---- device id 12
-                            |                         |
-                            |                         +---- device id 13
-                            |                         |
-                            |                         +---- device id 14
+    region 2 -+---- zone 1 -+---- 192.168.201.1 -+---- device id 12
+                            |                    |
+                            |                    +---- device id 13
+                            |                    |
+                            |                    +---- device id 14
                             |
-                            +---- 192.168.201.2:6000 -+---- device id 15
-                                                      |
-                                                      +---- device id 16
-                                                      |
-                                                      +---- device id 17
+                            +---- 192.168.201.2 -+---- device id 15
+                                                 |
+                                                 +---- device id 16
+                                                 |
+                                                 +---- device id 17
 
     The tier tree would look like:
 
     {
@@ -90,30 +90,30 @@ def build_tier_tree(devices):
     (1,): [(1, 1), (1, 2)],
     (2,): [(2, 1)],
 
-    (1, 1): [(1, 1, 192.168.101.1:6000),
-             (1, 1, 192.168.101.2:6000)],
-    (1, 2): [(1, 2, 192.168.102.1:6000),
-             (1, 2, 192.168.102.2:6000)],
-    (2, 1): [(2, 1, 192.168.201.1:6000),
-             (2, 1, 192.168.201.2:6000)],
+    (1, 1): [(1, 1, 192.168.101.1),
+             (1, 1, 192.168.101.2)],
+    (1, 2): [(1, 2, 192.168.102.1),
+             (1, 2, 192.168.102.2)],
+    (2, 1): [(2, 1, 192.168.201.1),
+             (2, 1, 192.168.201.2)],
 
-    (1, 1, 192.168.101.1:6000): [(1, 1, 192.168.101.1:6000, 0),
-                                 (1, 1, 192.168.101.1:6000, 1),
-                                 (1, 1, 192.168.101.1:6000, 2)],
-    (1, 1, 192.168.101.2:6000): [(1, 1, 192.168.101.2:6000, 3),
-                                 (1, 1, 192.168.101.2:6000, 4),
-                                 (1, 1, 192.168.101.2:6000, 5)],
-    (1, 2, 192.168.102.1:6000): [(1, 2, 192.168.102.1:6000, 6),
-                                 (1, 2, 192.168.102.1:6000, 7),
-                                 (1, 2, 192.168.102.1:6000, 8)],
-    (1, 2, 192.168.102.2:6000): [(1, 2, 192.168.102.2:6000, 9),
-                                 (1, 2, 192.168.102.2:6000, 10)],
-    (2, 1, 192.168.201.1:6000): [(2, 1, 192.168.201.1:6000, 12),
-                                 (2, 1, 192.168.201.1:6000, 13),
-                                 (2, 1, 192.168.201.1:6000, 14)],
-    (2, 1, 192.168.201.2:6000): [(2, 1, 192.168.201.2:6000, 15),
-                                 (2, 1, 192.168.201.2:6000, 16),
-                                 (2, 1, 192.168.201.2:6000, 17)],
+    (1, 1, 192.168.101.1): [(1, 1, 192.168.101.1, 0),
+                            (1, 1, 192.168.101.1, 1),
+                            (1, 1, 192.168.101.1, 2)],
+    (1, 1, 192.168.101.2): [(1, 1, 192.168.101.2, 3),
+                            (1, 1, 192.168.101.2, 4),
+                            (1, 1, 192.168.101.2, 5)],
+    (1, 2, 192.168.102.1): [(1, 2, 192.168.102.1, 6),
+                            (1, 2, 192.168.102.1, 7),
+                            (1, 2, 192.168.102.1, 8)],
+    (1, 2, 192.168.102.2): [(1, 2, 192.168.102.2, 9),
+                            (1, 2, 192.168.102.2, 10)],
+    (2, 1, 192.168.201.1): [(2, 1, 192.168.201.1, 12),
+                            (2, 1, 192.168.201.1, 13),
+                            (2, 1, 192.168.201.1, 14)],
+    (2, 1, 192.168.201.2): [(2, 1, 192.168.201.2, 15),
+                            (2, 1, 192.168.201.2, 16),
+                            (2, 1, 192.168.201.2, 17)],
     }
 
     :devices: device dicts from which to generate the tree
diff --git a/test/unit/common/ring/test_builder.py b/test/unit/common/ring/test_builder.py
index a05823368c..f1840b8210 100644
--- a/test/unit/common/ring/test_builder.py
+++ b/test/unit/common/ring/test_builder.py
@@ -1178,6 +1178,46 @@ class TestRingBuilder(unittest.TestCase):
             9: 64,
         })
 
+    def test_server_per_port(self):
+        # 3 servers, 3 disks each, with each disk on its own port
+        rb = ring.RingBuilder(8, 3, 1)
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '10.0.0.1', 'port': 10000, 'device': 'sdx'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '10.0.0.1', 'port': 10001, 'device': 'sdy'})
+
+        rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '10.0.0.2', 'port': 10000, 'device': 'sdx'})
+        rb.add_dev({'id': 4, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '10.0.0.2', 'port': 10001, 'device': 'sdy'})
+
+        rb.add_dev({'id': 6, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '10.0.0.3', 'port': 10000, 'device': 'sdx'})
+        rb.add_dev({'id': 7, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '10.0.0.3', 'port': 10001, 'device': 'sdy'})
+
+        rb.rebalance(seed=1)
+
+        rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '10.0.0.1', 'port': 10002, 'device': 'sdz'})
+        rb.add_dev({'id': 5, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '10.0.0.2', 'port': 10002, 'device': 'sdz'})
+        rb.add_dev({'id': 8, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '10.0.0.3', 'port': 10002, 'device': 'sdz'})
+
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=1)
+
+        poorly_dispersed = []
+        for part in range(rb.parts):
+            on_nodes = set()
+            for replica in range(rb.replicas):
+                dev_id = rb._replica2part2dev[replica][part]
+                on_nodes.add(rb.devs[dev_id]['ip'])
+            if len(on_nodes) < rb.replicas:
+                poorly_dispersed.append(part)
+        self.assertEqual(poorly_dispersed, [])
+
     def test_load(self):
         rb = ring.RingBuilder(8, 3, 1)
         devs = [{'id': 0, 'region': 0, 'zone': 0, 'weight': 1,
@@ -1503,9 +1543,9 @@ class TestRingBuilder(unittest.TestCase):
         self.assertEqual(rb._dispersion_graph, {
             (0,): [0, 0, 0, 256],
             (0, 0): [0, 0, 0, 256],
-            (0, 0, '127.0.0.1:10000'): [0, 0, 0, 256],
-            (0, 0, '127.0.0.1:10000', 0): [0, 128, 128, 0],
-            (0, 0, '127.0.0.1:10000', 1): [0, 128, 128, 0],
+            (0, 0, '127.0.0.1'): [0, 0, 0, 256],
+            (0, 0, '127.0.0.1', 0): [0, 128, 128, 0],
+            (0, 0, '127.0.0.1', 1): [0, 128, 128, 0],
         })
 
     def test_dispersion_with_zero_weight_devices_with_parts(self):
@@ -1522,10 +1562,10 @@ class TestRingBuilder(unittest.TestCase):
         self.assertEqual(rb._dispersion_graph, {
             (0,): [0, 0, 0, 256],
             (0, 0): [0, 0, 0, 256],
-            (0, 0, '127.0.0.1:10000'): [0, 0, 0, 256],
-            (0, 0, '127.0.0.1:10000', 0): [0, 256, 0, 0],
-            (0, 0, '127.0.0.1:10000', 1): [0, 256, 0, 0],
-            (0, 0, '127.0.0.1:10000', 2): [0, 256, 0, 0],
+            (0, 0, '127.0.0.1'): [0, 0, 0, 256],
+            (0, 0, '127.0.0.1', 0): [0, 256, 0, 0],
+            (0, 0, '127.0.0.1', 1): [0, 256, 0, 0],
+            (0, 0, '127.0.0.1', 2): [0, 256, 0, 0],
         })
         # now mark a device 2 for decom
         rb.set_dev_weight(2, 0.0)
@@ -1536,10 +1576,10 @@ class TestRingBuilder(unittest.TestCase):
         self.assertEqual(rb._dispersion_graph, {
             (0,): [0, 0, 0, 256],
             (0, 0): [0, 0, 0, 256],
-            (0, 0, '127.0.0.1:10000'): [0, 0, 0, 256],
-            (0, 0, '127.0.0.1:10000', 0): [0, 256, 0, 0],
-            (0, 0, '127.0.0.1:10000', 1): [0, 256, 0, 0],
-            (0, 0, '127.0.0.1:10000', 2): [0, 256, 0, 0],
+            (0, 0, '127.0.0.1'): [0, 0, 0, 256],
+            (0, 0, '127.0.0.1', 0): [0, 256, 0, 0],
+            (0, 0, '127.0.0.1', 1): [0, 256, 0, 0],
+            (0, 0, '127.0.0.1', 2): [0, 256, 0, 0],
         })
         rb.pretend_min_part_hours_passed()
         rb.rebalance(seed=3)
@@ -1547,9 +1587,9 @@ class TestRingBuilder(unittest.TestCase):
         self.assertEqual(rb._dispersion_graph, {
            (0,): [0, 0, 0, 256],
            (0, 0): [0, 0, 0, 256],
-            (0, 0, '127.0.0.1:10000'): [0, 0, 0, 256],
-            (0, 0, '127.0.0.1:10000', 0): [0, 128, 128, 0],
-            (0, 0, '127.0.0.1:10000', 1): [0, 128, 128, 0],
+            (0, 0, '127.0.0.1'): [0, 0, 0, 256],
+            (0, 0, '127.0.0.1', 0): [0, 128, 128, 0],
+            (0, 0, '127.0.0.1', 1): [0, 128, 128, 0],
         })
diff --git a/test/unit/common/ring/test_ring.py b/test/unit/common/ring/test_ring.py
index b97b60eeee..376eac674c 100644
--- a/test/unit/common/ring/test_ring.py
+++ b/test/unit/common/ring/test_ring.py
@@ -480,7 +480,8 @@ class TestRing(TestRingBase):
                 for device in xrange(1, 4):
                     rb.add_dev({'id': next_dev_id,
                                 'ip': '1.2.%d.%d' % (zone, server),
-                                'port': 1234, 'zone': zone, 'region': 0,
+                                'port': 1234 + device,
+                                'zone': zone, 'region': 0,
                                 'weight': 1.0})
                     next_dev_id += 1
         rb.rebalance(seed=1)
diff --git a/test/unit/common/ring/test_utils.py b/test/unit/common/ring/test_utils.py
index bf59a33be1..8eaca09756 100644
--- a/test/unit/common/ring/test_utils.py
+++ b/test/unit/common/ring/test_utils.py
@@ -70,8 +70,8 @@ class TestUtils(unittest.TestCase):
             tiers_for_dev(self.test_dev),
             ((1,),
              (1, 1),
-             (1, 1, '192.168.1.1:6000'),
-             (1, 1, '192.168.1.1:6000', 0)))
+             (1, 1, '192.168.1.1'),
+             (1, 1, '192.168.1.1', 0)))
 
     def test_build_tier_tree(self):
         ret = build_tier_tree(self.test_devs)
@@ -79,27 +79,27 @@ class TestUtils(unittest.TestCase):
         self.assertEqual(ret[()], set([(1,)]))
         self.assertEqual(ret[(1,)], set([(1, 1), (1, 2)]))
         self.assertEqual(ret[(1, 1)],
-                         set([(1, 1, '192.168.1.2:6000'),
-                              (1, 1, '192.168.1.1:6000')]))
+                         set([(1, 1, '192.168.1.2'),
+                              (1, 1, '192.168.1.1')]))
         self.assertEqual(ret[(1, 2)],
-                         set([(1, 2, '192.168.2.2:6000'),
-                              (1, 2, '192.168.2.1:6000')]))
-        self.assertEqual(ret[(1, 1, '192.168.1.1:6000')],
-                         set([(1, 1, '192.168.1.1:6000', 0),
-                              (1, 1, '192.168.1.1:6000', 1),
-                              (1, 1, '192.168.1.1:6000', 2)]))
-        self.assertEqual(ret[(1, 1, '192.168.1.2:6000')],
-                         set([(1, 1, '192.168.1.2:6000', 3),
-                              (1, 1, '192.168.1.2:6000', 4),
-                              (1, 1, '192.168.1.2:6000', 5)]))
-        self.assertEqual(ret[(1, 2, '192.168.2.1:6000')],
-                         set([(1, 2, '192.168.2.1:6000', 6),
-                              (1, 2, '192.168.2.1:6000', 7),
-                              (1, 2, '192.168.2.1:6000', 8)]))
-        self.assertEqual(ret[(1, 2, '192.168.2.2:6000')],
-                         set([(1, 2, '192.168.2.2:6000', 9),
-                              (1, 2, '192.168.2.2:6000', 10),
-                              (1, 2, '192.168.2.2:6000', 11)]))
+                         set([(1, 2, '192.168.2.2'),
+                              (1, 2, '192.168.2.1')]))
+        self.assertEqual(ret[(1, 1, '192.168.1.1')],
+                         set([(1, 1, '192.168.1.1', 0),
+                              (1, 1, '192.168.1.1', 1),
+                              (1, 1, '192.168.1.1', 2)]))
+        self.assertEqual(ret[(1, 1, '192.168.1.2')],
+                         set([(1, 1, '192.168.1.2', 3),
+                              (1, 1, '192.168.1.2', 4),
+                              (1, 1, '192.168.1.2', 5)]))
+        self.assertEqual(ret[(1, 2, '192.168.2.1')],
+                         set([(1, 2, '192.168.2.1', 6),
+                              (1, 2, '192.168.2.1', 7),
+                              (1, 2, '192.168.2.1', 8)]))
+        self.assertEqual(ret[(1, 2, '192.168.2.2')],
+                         set([(1, 2, '192.168.2.2', 9),
+                              (1, 2, '192.168.2.2', 10),
+                              (1, 2, '192.168.2.2', 11)]))
 
     def test_is_valid_ip(self):
         self.assertTrue(is_valid_ip("127.0.0.1"))
@@ -623,11 +623,11 @@ class TestUtils(unittest.TestCase):
     def test_dispersion_report(self):
         rb = ring.RingBuilder(8, 3, 0)
         rb.add_dev({'id': 0, 'region': 1, 'zone': 0, 'weight': 100,
-                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'})
+                    'ip': '127.0.0.0', 'port': 10000, 'device': 'sda1'})
         rb.add_dev({'id': 1, 'region': 1, 'zone': 1, 'weight': 200,
                     'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
         rb.add_dev({'id': 2, 'region': 1, 'zone': 1, 'weight': 200,
-                    'ip': '127.0.0.1', 'port': 10002, 'device': 'sda1'})
+                    'ip': '127.0.0.2', 'port': 10002, 'device': 'sda1'})
         rb.rebalance(seed=10)
 
         self.assertEqual(rb.dispersion, 39.84375)
@@ -635,16 +635,6 @@ class TestUtils(unittest.TestCase):
         self.assertEqual(report['worst_tier'], 'r1z1')
         self.assertEqual(report['max_dispersion'], 39.84375)
 
-        # Each node should store 256 partitions to avoid multiple replicas
-        # 2/5 of total weight * 768 ~= 307 -> 51 partitions on each node in
-        # zone 1 are stored at least twice on the nodes
-        expected = [
-            ['r1z1', 2, '0', '154', '102'],
-            ['r1z1-127.0.0.1:10001', 1, '205', '51', '0'],
-            ['r1z1-127.0.0.1:10001/sda1', 1, '205', '51', '0'],
-            ['r1z1-127.0.0.1:10002', 1, '205', '51', '0'],
-            ['r1z1-127.0.0.1:10002/sda1', 1, '205', '51', '0']]
-
         def build_tier_report(max_replicas, placed_parts, dispersion,
                               replicas):
             return {
@@ -653,16 +643,20 @@ class TestUtils(unittest.TestCase):
                 'dispersion': dispersion,
                 'replicas': replicas,
             }
+
+        # Each node should store 256 partitions to avoid multiple replicas
+        # 2/5 of total weight * 768 ~= 307 -> 51 partitions on each node in
+        # zone 1 are stored at least twice on the nodes
         expected = [
             ['r1z1', build_tier_report(
                 2, 256, 39.84375, [0, 0, 154, 102])],
-            ['r1z1-127.0.0.1:10001', build_tier_report(
+            ['r1z1-127.0.0.1', build_tier_report(
                 1, 256, 19.921875, [0, 205, 51, 0])],
-            ['r1z1-127.0.0.1:10001/sda1', build_tier_report(
+            ['r1z1-127.0.0.1/sda1', build_tier_report(
                 1, 256, 19.921875, [0, 205, 51, 0])],
-            ['r1z1-127.0.0.1:10002', build_tier_report(
+            ['r1z1-127.0.0.2', build_tier_report(
                 1, 256, 19.921875, [0, 205, 51, 0])],
-            ['r1z1-127.0.0.1:10002/sda1', build_tier_report(
+            ['r1z1-127.0.0.2/sda1', build_tier_report(
                 1, 256, 19.921875, [0, 205, 51, 0])],
         ]
         report = dispersion_report(rb, 'r1z1.*', verbose=True)
@@ -678,7 +672,7 @@ class TestUtils(unittest.TestCase):
 
         report = dispersion_report(rb)
         self.assertEqual(rb.dispersion, 40.234375)
-        self.assertEqual(report['worst_tier'], 'r1z0-127.0.0.1:10003')
+        self.assertEqual(report['worst_tier'], 'r1z0-127.0.0.1')
        self.assertEqual(report['max_dispersion'], 30.078125)
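
For illustration, a minimal sketch (not part of the patch itself) of the commit
message's 4-machine example, using only the RingBuilder calls already
exercised by test_server_per_port above. The IPs, ports, device names, and
rebalance seed are arbitrary placeholders, and the final assertion assumes a
full rebalance reaches the optimal placement the commit message describes:

    from swift.common import ring

    # 1 region, 1 zone, 4 machines, 12 disks per machine, and one
    # object server (and hence one port) per disk.
    rb = ring.RingBuilder(8, 3, 1)  # part_power, replicas, min_part_hours
    dev_id = 0
    for machine in range(4):
        for disk in range(12):
            rb.add_dev({'id': dev_id, 'region': 0, 'zone': 0, 'weight': 1,
                        'ip': '10.0.0.%d' % (machine + 1),
                        'port': 10000 + disk,
                        'device': 'sd%c' % (ord('a') + disk)})
            dev_id += 1
    rb.rebalance(seed=1)

    # With IP-level tiers, each partition's three replicas should land on
    # three distinct machines; under the old IP/port tiers, replicas could
    # share a machine, since every disk looked like its own failure domain.
    for part in range(rb.parts):
        ips = set(rb.devs[rb._replica2part2dev[replica][part]]['ip']
                  for replica in range(rb.replicas))
        assert len(ips) == rb.replicas, 'partition %d poorly dispersed' % part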