Use just IP, not port, when determining partition placement

In the ring builder, we place partitions with maximum possible
dispersion across tiers, where a "tier" is region, then zone, then
IP/port,then device. Now, instead of IP/port, just use IP. The port
wasn't really getting us anything; two different object servers on two
different ports on one machine aren't separate failure
domains. However, if someone has only a few machines and is using one
object server on its own port per disk, then the ring builder would
end up with every disk in its own IP/port tier, resulting in bad (with
respect to durability) partition placement.

For example: assume 1 region, 1 zone, 4 machines, 48 total disks (12
per machine), and one object server (and hence one port) per
disk. With the old behavior, partition replicas will all go in the one
region, then the one zone, then pick one of 48 IP/port pairs, then
pick the one disk therein. This gives the same result as randomly
picking 3 disks (without replacement) to store data on; it completely
ignores machine boundaries.

With the new behavior, the replica placer will pick the one region,
then the one zone, then one of 4 IPs, then one of 12 disks
therein. This gives the optimal placement with respect to durability.

The same applies to Ring.get_more_nodes().

Co-Authored-By: Kota Tsuyuzaki <tsuyuzaki.kota@lab.ntt.co.jp>

Change-Id: Ibbd740c51296b7e360845b5309d276d7383a3742
This commit is contained in:
Samuel Merritt 2015-06-15 13:36:36 -07:00
parent 0e22371cd0
commit af72881d1d
5 changed files with 157 additions and 124 deletions

View File

@ -179,18 +179,17 @@ class Ring(object):
# doing it on every call to get_more_nodes().
regions = set()
zones = set()
ip_ports = set()
ips = set()
self._num_devs = 0
for dev in self._devs:
if dev:
regions.add(dev['region'])
zones.add((dev['region'], dev['zone']))
ip_ports.add((dev['region'], dev['zone'],
dev['ip'], dev['port']))
ips.add((dev['region'], dev['zone'], dev['ip']))
self._num_devs += 1
self._num_regions = len(regions)
self._num_zones = len(zones)
self._num_ip_ports = len(ip_ports)
self._num_ips = len(ips)
def _rebuild_tier_data(self):
self.tier2devs = defaultdict(list)
@ -329,8 +328,8 @@ class Ring(object):
used = set(d['id'] for d in primary_nodes)
same_regions = set(d['region'] for d in primary_nodes)
same_zones = set((d['region'], d['zone']) for d in primary_nodes)
same_ip_ports = set((d['region'], d['zone'], d['ip'], d['port'])
for d in primary_nodes)
same_ips = set(
(d['region'], d['zone'], d['ip']) for d in primary_nodes)
parts = len(self._replica2part2dev_id[0])
start = struct.unpack_from(
@ -356,9 +355,9 @@ class Ring(object):
used.add(dev_id)
same_regions.add(region)
zone = dev['zone']
ip_port = (region, zone, dev['ip'], dev['port'])
ip = (region, zone, dev['ip'])
same_zones.add((region, zone))
same_ip_ports.add(ip_port)
same_ips.add(ip)
if len(same_regions) == self._num_regions:
hit_all_regions = True
break
@ -380,17 +379,17 @@ class Ring(object):
yield dev
used.add(dev_id)
same_zones.add(zone)
ip_port = zone + (dev['ip'], dev['port'])
same_ip_ports.add(ip_port)
ip = zone + (dev['ip'],)
same_ips.add(ip)
if len(same_zones) == self._num_zones:
hit_all_zones = True
break
hit_all_ip_ports = len(same_ip_ports) == self._num_ip_ports
hit_all_ips = len(same_ips) == self._num_ips
for handoff_part in chain(xrange(start, parts, inc),
xrange(inc - ((parts - start) % inc),
start, inc)):
if hit_all_ip_ports:
if hit_all_ips:
# We've exhausted the pool of unused backends, so stop
# looking.
break
@ -398,14 +397,13 @@ class Ring(object):
if handoff_part < len(part2dev_id):
dev_id = part2dev_id[handoff_part]
dev = self._devs[dev_id]
ip_port = (dev['region'], dev['zone'],
dev['ip'], dev['port'])
if dev_id not in used and ip_port not in same_ip_ports:
ip = (dev['region'], dev['zone'], dev['ip'])
if dev_id not in used and ip not in same_ips:
yield dev
used.add(dev_id)
same_ip_ports.add(ip_port)
if len(same_ip_ports) == self._num_ip_ports:
hit_all_ip_ports = True
same_ips.add(ip)
if len(same_ips) == self._num_ips:
hit_all_ips = True
break
hit_all_devs = len(used) == self._num_devs

View File

@ -29,7 +29,7 @@ def tiers_for_dev(dev):
"""
t1 = dev['region']
t2 = dev['zone']
t3 = "{ip}:{port}".format(ip=dev.get('ip'), port=dev.get('port'))
t3 = dev['ip']
t4 = dev['id']
return ((t1,),
@ -48,40 +48,40 @@ def build_tier_tree(devices):
Example:
region 1 -+---- zone 1 -+---- 192.168.101.1:6000 -+---- device id 0
| | |
| | +---- device id 1
| | |
| | +---- device id 2
region 1 -+---- zone 1 -+---- 192.168.101.1 -+---- device id 0
| | |
| | +---- device id 1
| | |
| | +---- device id 2
| |
| +---- 192.168.101.2:6000 -+---- device id 3
| |
| +---- device id 4
| |
| +---- device id 5
| +---- 192.168.101.2 -+---- device id 3
| |
| +---- device id 4
| |
| +---- device id 5
|
+---- zone 2 -+---- 192.168.102.1:6000 -+---- device id 6
| |
| +---- device id 7
| |
| +---- device id 8
+---- zone 2 -+---- 192.168.102.1 -+---- device id 6
| |
| +---- device id 7
| |
| +---- device id 8
|
+---- 192.168.102.2:6000 -+---- device id 9
|
+---- device id 10
+---- 192.168.102.2 -+---- device id 9
|
+---- device id 10
region 2 -+---- zone 1 -+---- 192.168.201.1:6000 -+---- device id 12
| |
| +---- device id 13
| |
| +---- device id 14
region 2 -+---- zone 1 -+---- 192.168.201.1 -+---- device id 12
| |
| +---- device id 13
| |
| +---- device id 14
|
+---- 192.168.201.2:6000 -+---- device id 15
|
+---- device id 16
|
+---- device id 17
+---- 192.168.201.2 -+---- device id 15
|
+---- device id 16
|
+---- device id 17
The tier tree would look like:
{
@ -90,30 +90,30 @@ def build_tier_tree(devices):
(1,): [(1, 1), (1, 2)],
(2,): [(2, 1)],
(1, 1): [(1, 1, 192.168.101.1:6000),
(1, 1, 192.168.101.2:6000)],
(1, 2): [(1, 2, 192.168.102.1:6000),
(1, 2, 192.168.102.2:6000)],
(2, 1): [(2, 1, 192.168.201.1:6000),
(2, 1, 192.168.201.2:6000)],
(1, 1): [(1, 1, 192.168.101.1),
(1, 1, 192.168.101.2)],
(1, 2): [(1, 2, 192.168.102.1),
(1, 2, 192.168.102.2)],
(2, 1): [(2, 1, 192.168.201.1),
(2, 1, 192.168.201.2)],
(1, 1, 192.168.101.1:6000): [(1, 1, 192.168.101.1:6000, 0),
(1, 1, 192.168.101.1:6000, 1),
(1, 1, 192.168.101.1:6000, 2)],
(1, 1, 192.168.101.2:6000): [(1, 1, 192.168.101.2:6000, 3),
(1, 1, 192.168.101.2:6000, 4),
(1, 1, 192.168.101.2:6000, 5)],
(1, 2, 192.168.102.1:6000): [(1, 2, 192.168.102.1:6000, 6),
(1, 2, 192.168.102.1:6000, 7),
(1, 2, 192.168.102.1:6000, 8)],
(1, 2, 192.168.102.2:6000): [(1, 2, 192.168.102.2:6000, 9),
(1, 2, 192.168.102.2:6000, 10)],
(2, 1, 192.168.201.1:6000): [(2, 1, 192.168.201.1:6000, 12),
(2, 1, 192.168.201.1:6000, 13),
(2, 1, 192.168.201.1:6000, 14)],
(2, 1, 192.168.201.2:6000): [(2, 1, 192.168.201.2:6000, 15),
(2, 1, 192.168.201.2:6000, 16),
(2, 1, 192.168.201.2:6000, 17)],
(1, 1, 192.168.101.1): [(1, 1, 192.168.101.1, 0),
(1, 1, 192.168.101.1, 1),
(1, 1, 192.168.101.1, 2)],
(1, 1, 192.168.101.2): [(1, 1, 192.168.101.2, 3),
(1, 1, 192.168.101.2, 4),
(1, 1, 192.168.101.2, 5)],
(1, 2, 192.168.102.1): [(1, 2, 192.168.102.1, 6),
(1, 2, 192.168.102.1, 7),
(1, 2, 192.168.102.1, 8)],
(1, 2, 192.168.102.2): [(1, 2, 192.168.102.2, 9),
(1, 2, 192.168.102.2, 10)],
(2, 1, 192.168.201.1): [(2, 1, 192.168.201.1, 12),
(2, 1, 192.168.201.1, 13),
(2, 1, 192.168.201.1, 14)],
(2, 1, 192.168.201.2): [(2, 1, 192.168.201.2, 15),
(2, 1, 192.168.201.2, 16),
(2, 1, 192.168.201.2, 17)],
}
:devices: device dicts from which to generate the tree

View File

@ -1178,6 +1178,46 @@ class TestRingBuilder(unittest.TestCase):
9: 64,
})
def test_server_per_port(self):
# 3 servers, 3 disks each, with each disk on its own port
rb = ring.RingBuilder(8, 3, 1)
rb.add_dev({'id': 0, 'region': 0, 'region': 0, 'zone': 0, 'weight': 1,
'ip': '10.0.0.1', 'port': 10000, 'device': 'sdx'})
rb.add_dev({'id': 1, 'region': 0, 'region': 0, 'zone': 0, 'weight': 1,
'ip': '10.0.0.1', 'port': 10001, 'device': 'sdy'})
rb.add_dev({'id': 3, 'region': 0, 'region': 0, 'zone': 0, 'weight': 1,
'ip': '10.0.0.2', 'port': 10000, 'device': 'sdx'})
rb.add_dev({'id': 4, 'region': 0, 'region': 0, 'zone': 0, 'weight': 1,
'ip': '10.0.0.2', 'port': 10001, 'device': 'sdy'})
rb.add_dev({'id': 6, 'region': 0, 'region': 0, 'zone': 0, 'weight': 1,
'ip': '10.0.0.3', 'port': 10000, 'device': 'sdx'})
rb.add_dev({'id': 7, 'region': 0, 'region': 0, 'zone': 0, 'weight': 1,
'ip': '10.0.0.3', 'port': 10001, 'device': 'sdy'})
rb.rebalance(seed=1)
rb.add_dev({'id': 2, 'region': 0, 'region': 0, 'zone': 0, 'weight': 1,
'ip': '10.0.0.1', 'port': 10002, 'device': 'sdz'})
rb.add_dev({'id': 5, 'region': 0, 'region': 0, 'zone': 0, 'weight': 1,
'ip': '10.0.0.2', 'port': 10002, 'device': 'sdz'})
rb.add_dev({'id': 8, 'region': 0, 'region': 0, 'zone': 0, 'weight': 1,
'ip': '10.0.0.3', 'port': 10002, 'device': 'sdz'})
rb.pretend_min_part_hours_passed()
rb.rebalance(seed=1)
poorly_dispersed = []
for part in range(rb.parts):
on_nodes = set()
for replica in range(rb.replicas):
dev_id = rb._replica2part2dev[replica][part]
on_nodes.add(rb.devs[dev_id]['ip'])
if len(on_nodes) < rb.replicas:
poorly_dispersed.append(part)
self.assertEqual(poorly_dispersed, [])
def test_load(self):
rb = ring.RingBuilder(8, 3, 1)
devs = [{'id': 0, 'region': 0, 'zone': 0, 'weight': 1,
@ -1503,9 +1543,9 @@ class TestRingBuilder(unittest.TestCase):
self.assertEqual(rb._dispersion_graph, {
(0,): [0, 0, 0, 256],
(0, 0): [0, 0, 0, 256],
(0, 0, '127.0.0.1:10000'): [0, 0, 0, 256],
(0, 0, '127.0.0.1:10000', 0): [0, 128, 128, 0],
(0, 0, '127.0.0.1:10000', 1): [0, 128, 128, 0],
(0, 0, '127.0.0.1'): [0, 0, 0, 256],
(0, 0, '127.0.0.1', 0): [0, 128, 128, 0],
(0, 0, '127.0.0.1', 1): [0, 128, 128, 0],
})
def test_dispersion_with_zero_weight_devices_with_parts(self):
@ -1522,10 +1562,10 @@ class TestRingBuilder(unittest.TestCase):
self.assertEqual(rb._dispersion_graph, {
(0,): [0, 0, 0, 256],
(0, 0): [0, 0, 0, 256],
(0, 0, '127.0.0.1:10000'): [0, 0, 0, 256],
(0, 0, '127.0.0.1:10000', 0): [0, 256, 0, 0],
(0, 0, '127.0.0.1:10000', 1): [0, 256, 0, 0],
(0, 0, '127.0.0.1:10000', 2): [0, 256, 0, 0],
(0, 0, '127.0.0.1'): [0, 0, 0, 256],
(0, 0, '127.0.0.1', 0): [0, 256, 0, 0],
(0, 0, '127.0.0.1', 1): [0, 256, 0, 0],
(0, 0, '127.0.0.1', 2): [0, 256, 0, 0],
})
# now mark a device 2 for decom
rb.set_dev_weight(2, 0.0)
@ -1536,10 +1576,10 @@ class TestRingBuilder(unittest.TestCase):
self.assertEqual(rb._dispersion_graph, {
(0,): [0, 0, 0, 256],
(0, 0): [0, 0, 0, 256],
(0, 0, '127.0.0.1:10000'): [0, 0, 0, 256],
(0, 0, '127.0.0.1:10000', 0): [0, 256, 0, 0],
(0, 0, '127.0.0.1:10000', 1): [0, 256, 0, 0],
(0, 0, '127.0.0.1:10000', 2): [0, 256, 0, 0],
(0, 0, '127.0.0.1'): [0, 0, 0, 256],
(0, 0, '127.0.0.1', 0): [0, 256, 0, 0],
(0, 0, '127.0.0.1', 1): [0, 256, 0, 0],
(0, 0, '127.0.0.1', 2): [0, 256, 0, 0],
})
rb.pretend_min_part_hours_passed()
rb.rebalance(seed=3)
@ -1547,9 +1587,9 @@ class TestRingBuilder(unittest.TestCase):
self.assertEqual(rb._dispersion_graph, {
(0,): [0, 0, 0, 256],
(0, 0): [0, 0, 0, 256],
(0, 0, '127.0.0.1:10000'): [0, 0, 0, 256],
(0, 0, '127.0.0.1:10000', 0): [0, 128, 128, 0],
(0, 0, '127.0.0.1:10000', 1): [0, 128, 128, 0],
(0, 0, '127.0.0.1'): [0, 0, 0, 256],
(0, 0, '127.0.0.1', 0): [0, 128, 128, 0],
(0, 0, '127.0.0.1', 1): [0, 128, 128, 0],
})

View File

@ -480,7 +480,8 @@ class TestRing(TestRingBase):
for device in xrange(1, 4):
rb.add_dev({'id': next_dev_id,
'ip': '1.2.%d.%d' % (zone, server),
'port': 1234, 'zone': zone, 'region': 0,
'port': 1234 + device,
'zone': zone, 'region': 0,
'weight': 1.0})
next_dev_id += 1
rb.rebalance(seed=1)

View File

@ -70,8 +70,8 @@ class TestUtils(unittest.TestCase):
tiers_for_dev(self.test_dev),
((1,),
(1, 1),
(1, 1, '192.168.1.1:6000'),
(1, 1, '192.168.1.1:6000', 0)))
(1, 1, '192.168.1.1'),
(1, 1, '192.168.1.1', 0)))
def test_build_tier_tree(self):
ret = build_tier_tree(self.test_devs)
@ -79,27 +79,27 @@ class TestUtils(unittest.TestCase):
self.assertEqual(ret[()], set([(1,)]))
self.assertEqual(ret[(1,)], set([(1, 1), (1, 2)]))
self.assertEqual(ret[(1, 1)],
set([(1, 1, '192.168.1.2:6000'),
(1, 1, '192.168.1.1:6000')]))
set([(1, 1, '192.168.1.2'),
(1, 1, '192.168.1.1')]))
self.assertEqual(ret[(1, 2)],
set([(1, 2, '192.168.2.2:6000'),
(1, 2, '192.168.2.1:6000')]))
self.assertEqual(ret[(1, 1, '192.168.1.1:6000')],
set([(1, 1, '192.168.1.1:6000', 0),
(1, 1, '192.168.1.1:6000', 1),
(1, 1, '192.168.1.1:6000', 2)]))
self.assertEqual(ret[(1, 1, '192.168.1.2:6000')],
set([(1, 1, '192.168.1.2:6000', 3),
(1, 1, '192.168.1.2:6000', 4),
(1, 1, '192.168.1.2:6000', 5)]))
self.assertEqual(ret[(1, 2, '192.168.2.1:6000')],
set([(1, 2, '192.168.2.1:6000', 6),
(1, 2, '192.168.2.1:6000', 7),
(1, 2, '192.168.2.1:6000', 8)]))
self.assertEqual(ret[(1, 2, '192.168.2.2:6000')],
set([(1, 2, '192.168.2.2:6000', 9),
(1, 2, '192.168.2.2:6000', 10),
(1, 2, '192.168.2.2:6000', 11)]))
set([(1, 2, '192.168.2.2'),
(1, 2, '192.168.2.1')]))
self.assertEqual(ret[(1, 1, '192.168.1.1')],
set([(1, 1, '192.168.1.1', 0),
(1, 1, '192.168.1.1', 1),
(1, 1, '192.168.1.1', 2)]))
self.assertEqual(ret[(1, 1, '192.168.1.2')],
set([(1, 1, '192.168.1.2', 3),
(1, 1, '192.168.1.2', 4),
(1, 1, '192.168.1.2', 5)]))
self.assertEqual(ret[(1, 2, '192.168.2.1')],
set([(1, 2, '192.168.2.1', 6),
(1, 2, '192.168.2.1', 7),
(1, 2, '192.168.2.1', 8)]))
self.assertEqual(ret[(1, 2, '192.168.2.2')],
set([(1, 2, '192.168.2.2', 9),
(1, 2, '192.168.2.2', 10),
(1, 2, '192.168.2.2', 11)]))
def test_is_valid_ip(self):
self.assertTrue(is_valid_ip("127.0.0.1"))
@ -623,11 +623,11 @@ class TestUtils(unittest.TestCase):
def test_dispersion_report(self):
rb = ring.RingBuilder(8, 3, 0)
rb.add_dev({'id': 0, 'region': 1, 'zone': 0, 'weight': 100,
'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'})
'ip': '127.0.0.0', 'port': 10000, 'device': 'sda1'})
rb.add_dev({'id': 1, 'region': 1, 'zone': 1, 'weight': 200,
'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
rb.add_dev({'id': 2, 'region': 1, 'zone': 1, 'weight': 200,
'ip': '127.0.0.1', 'port': 10002, 'device': 'sda1'})
'ip': '127.0.0.2', 'port': 10002, 'device': 'sda1'})
rb.rebalance(seed=10)
self.assertEqual(rb.dispersion, 39.84375)
@ -635,16 +635,6 @@ class TestUtils(unittest.TestCase):
self.assertEqual(report['worst_tier'], 'r1z1')
self.assertEqual(report['max_dispersion'], 39.84375)
# Each node should store 256 partitions to avoid multiple replicas
# 2/5 of total weight * 768 ~= 307 -> 51 partitions on each node in
# zone 1 are stored at least twice on the nodes
expected = [
['r1z1', 2, '0', '154', '102'],
['r1z1-127.0.0.1:10001', 1, '205', '51', '0'],
['r1z1-127.0.0.1:10001/sda1', 1, '205', '51', '0'],
['r1z1-127.0.0.1:10002', 1, '205', '51', '0'],
['r1z1-127.0.0.1:10002/sda1', 1, '205', '51', '0']]
def build_tier_report(max_replicas, placed_parts, dispersion,
replicas):
return {
@ -653,16 +643,20 @@ class TestUtils(unittest.TestCase):
'dispersion': dispersion,
'replicas': replicas,
}
# Each node should store 256 partitions to avoid multiple replicas
# 2/5 of total weight * 768 ~= 307 -> 51 partitions on each node in
# zone 1 are stored at least twice on the nodes
expected = [
['r1z1', build_tier_report(
2, 256, 39.84375, [0, 0, 154, 102])],
['r1z1-127.0.0.1:10001', build_tier_report(
['r1z1-127.0.0.1', build_tier_report(
1, 256, 19.921875, [0, 205, 51, 0])],
['r1z1-127.0.0.1:10001/sda1', build_tier_report(
['r1z1-127.0.0.1/sda1', build_tier_report(
1, 256, 19.921875, [0, 205, 51, 0])],
['r1z1-127.0.0.1:10002', build_tier_report(
['r1z1-127.0.0.2', build_tier_report(
1, 256, 19.921875, [0, 205, 51, 0])],
['r1z1-127.0.0.1:10002/sda1', build_tier_report(
['r1z1-127.0.0.2/sda1', build_tier_report(
1, 256, 19.921875, [0, 205, 51, 0])],
]
report = dispersion_report(rb, 'r1z1.*', verbose=True)
@ -678,7 +672,7 @@ class TestUtils(unittest.TestCase):
report = dispersion_report(rb)
self.assertEqual(rb.dispersion, 40.234375)
self.assertEqual(report['worst_tier'], 'r1z0-127.0.0.1:10003')
self.assertEqual(report['worst_tier'], 'r1z0-127.0.0.1')
self.assertEqual(report['max_dispersion'], 30.078125)