Fixes a race condition in the hash ring code
The current hash ring code suffers from several problems: 1. The cache is reset on any get_topic_for call, which means that the cache is not used between API calls. Previously it was done to work around the situation when a new conductor is not visible until the API process restarts. Currently we refresh the hash rings periodically. This patch removes resetting the cache. To avoid waiting 2 minutes to be able to use a new driver: 1) the hash ring cache is always rebuilt when a ring is not found, 2) the hash_ring_reset_interval option was changed to 15 seconds. 2. The reset of the cache races with the hot path in the get_ring call. It is possible that the reset happens after the class-level cache variable is checked but before it is used, yielding None. This patch stores the value of the class-level variable to a local variable before checking, thus ensuring None is never returned. Finally, some logging was added to the modified code to make this kind of problems more debugable in the future. Change-Id: I6e18c6ec23a053b59c76fcadd52b13d84d81b4fb Story: #2003966 Task: #26896 Partial-Bug: #1792872
This commit is contained in:
parent
c80e912b3a
commit
5471157e4f
@ -16,6 +16,7 @@
|
|||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
from oslo_log import log
|
||||||
from tooz import hashring
|
from tooz import hashring
|
||||||
|
|
||||||
from ironic.common import exception
|
from ironic.common import exception
|
||||||
@ -24,6 +25,9 @@ from ironic.conf import CONF
|
|||||||
from ironic.db import api as dbapi
|
from ironic.db import api as dbapi
|
||||||
|
|
||||||
|
|
||||||
|
LOG = log.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class HashRingManager(object):
|
class HashRingManager(object):
|
||||||
_hash_rings = None
|
_hash_rings = None
|
||||||
_lock = threading.Lock()
|
_lock = threading.Lock()
|
||||||
@ -42,15 +46,20 @@ class HashRingManager(object):
|
|||||||
if not self.cache:
|
if not self.cache:
|
||||||
return self._load_hash_rings()
|
return self._load_hash_rings()
|
||||||
|
|
||||||
# Hot path, no lock
|
# Hot path, no lock. Using a local variable to avoid races with code
|
||||||
if self.__class__._hash_rings is not None and self.updated_at >= limit:
|
# changing the class variable.
|
||||||
return self.__class__._hash_rings
|
hash_rings = self.__class__._hash_rings
|
||||||
|
if hash_rings is not None and self.updated_at >= limit:
|
||||||
|
return hash_rings
|
||||||
|
|
||||||
with self._lock:
|
with self._lock:
|
||||||
if self.__class__._hash_rings is None or self.updated_at < limit:
|
if self.__class__._hash_rings is None or self.updated_at < limit:
|
||||||
|
LOG.debug('Rebuilding cached hash rings')
|
||||||
rings = self._load_hash_rings()
|
rings = self._load_hash_rings()
|
||||||
self.__class__._hash_rings = rings
|
self.__class__._hash_rings = rings
|
||||||
self.updated_at = time.time()
|
self.updated_at = time.time()
|
||||||
|
LOG.debug('Finished rebuilding hash rings, available drivers '
|
||||||
|
'are %s', ', '.join(rings))
|
||||||
return self.__class__._hash_rings
|
return self.__class__._hash_rings
|
||||||
|
|
||||||
def _load_hash_rings(self):
|
def _load_hash_rings(self):
|
||||||
@ -67,9 +76,28 @@ class HashRingManager(object):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def reset(cls):
|
def reset(cls):
|
||||||
with cls._lock:
|
with cls._lock:
|
||||||
|
LOG.debug('Resetting cached hash rings')
|
||||||
cls._hash_rings = None
|
cls._hash_rings = None
|
||||||
|
|
||||||
def get_ring(self, driver_name, conductor_group):
|
def get_ring(self, driver_name, conductor_group):
|
||||||
|
try:
|
||||||
|
return self._get_ring(driver_name, conductor_group)
|
||||||
|
except (exception.DriverNotFound, exception.TemporaryFailure):
|
||||||
|
# NOTE(dtantsur): we assume that this case is more often caused by
|
||||||
|
# conductors coming and leaving, so we try to rebuild the rings.
|
||||||
|
LOG.debug('No conductor from group %(group)s found for driver '
|
||||||
|
'%(driver)s, trying to rebuild the hash rings',
|
||||||
|
{'driver': driver_name,
|
||||||
|
'group': conductor_group or '<none>'})
|
||||||
|
|
||||||
|
self.__class__.reset()
|
||||||
|
return self._get_ring(driver_name, conductor_group)
|
||||||
|
|
||||||
|
def _get_ring(self, driver_name, conductor_group):
|
||||||
|
# There are no conductors, temporary failure - 503 Service Unavailable
|
||||||
|
if not self.ring:
|
||||||
|
raise exception.TemporaryFailure()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if self.use_groups:
|
if self.use_groups:
|
||||||
return self.ring['%s:%s' % (conductor_group, driver_name)]
|
return self.ring['%s:%s' % (conductor_group, driver_name)]
|
||||||
|
@ -131,12 +131,6 @@ class ConductorAPI(object):
|
|||||||
:raises: NoValidHost
|
:raises: NoValidHost
|
||||||
|
|
||||||
"""
|
"""
|
||||||
self.ring_manager.reset()
|
|
||||||
|
|
||||||
# There are no conductors, temporary failure - 503 Service Unavailable
|
|
||||||
if not self.ring_manager.ring:
|
|
||||||
raise exception.TemporaryFailure()
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
ring = self.ring_manager.get_ring(node.driver,
|
ring = self.ring_manager.get_ring(node.driver,
|
||||||
node.conductor_group)
|
node.conductor_group)
|
||||||
@ -166,7 +160,12 @@ class ConductorAPI(object):
|
|||||||
# does not take groups into account.
|
# does not take groups into account.
|
||||||
local_ring_manager = hash_ring.HashRingManager(use_groups=False,
|
local_ring_manager = hash_ring.HashRingManager(use_groups=False,
|
||||||
cache=False)
|
cache=False)
|
||||||
ring = local_ring_manager.get_ring(driver_name, '')
|
try:
|
||||||
|
ring = local_ring_manager.get_ring(driver_name, '')
|
||||||
|
except exception.TemporaryFailure:
|
||||||
|
# NOTE(dtantsur): even if no conductors are registered, it makes
|
||||||
|
# sense to report 404 on any driver request.
|
||||||
|
raise exception.DriverNotFound(_("No conductors registered."))
|
||||||
host = random.choice(list(ring.nodes))
|
host = random.choice(list(ring.nodes))
|
||||||
return self.topic + "." + host
|
return self.topic + "." + host
|
||||||
|
|
||||||
|
@ -192,8 +192,10 @@ hash_opts = [
|
|||||||
'and potentially allow the Ironic cluster to recover '
|
'and potentially allow the Ironic cluster to recover '
|
||||||
'more quickly if a conductor instance is terminated.')),
|
'more quickly if a conductor instance is terminated.')),
|
||||||
cfg.IntOpt('hash_ring_reset_interval',
|
cfg.IntOpt('hash_ring_reset_interval',
|
||||||
default=180,
|
default=15,
|
||||||
help=_('Interval (in seconds) between hash ring resets.')),
|
help=_('Time (in seconds) after which the hash ring is '
|
||||||
|
'considered outdated and is refreshed on the next '
|
||||||
|
'access.')),
|
||||||
]
|
]
|
||||||
|
|
||||||
image_opts = [
|
image_opts = [
|
||||||
|
@ -80,29 +80,42 @@ class HashRingManagerTestCase(db_base.DbTestCase):
|
|||||||
self.ring_manager.get_ring,
|
self.ring_manager.get_ring,
|
||||||
'driver3', '')
|
'driver3', '')
|
||||||
|
|
||||||
def test_hash_ring_manager_no_refresh(self):
|
def test_hash_ring_manager_automatic_retry(self):
|
||||||
# If a new conductor is registered after the ring manager is
|
self.assertRaises(exception.TemporaryFailure,
|
||||||
# initialized, it won't be seen. Long term this is probably
|
|
||||||
# undesirable, but today is the intended behavior.
|
|
||||||
self.assertRaises(exception.DriverNotFound,
|
|
||||||
self.ring_manager.get_ring,
|
self.ring_manager.get_ring,
|
||||||
'hardware-type', '')
|
'hardware-type', '')
|
||||||
self.register_conductors()
|
self.register_conductors()
|
||||||
self.assertRaises(exception.DriverNotFound,
|
|
||||||
self.ring_manager.get_ring,
|
|
||||||
'hardware-type', '')
|
|
||||||
|
|
||||||
def test_hash_ring_manager_refresh(self):
|
|
||||||
CONF.set_override('hash_ring_reset_interval', 30)
|
|
||||||
# Initialize the ring manager to make _hash_rings not None, then
|
|
||||||
# hash ring will refresh only when time interval exceeded.
|
|
||||||
self.assertRaises(exception.DriverNotFound,
|
|
||||||
self.ring_manager.get_ring,
|
|
||||||
'hardware-type', '')
|
|
||||||
self.register_conductors()
|
|
||||||
self.ring_manager.updated_at = time.time() - 31
|
|
||||||
self.ring_manager.get_ring('hardware-type', '')
|
self.ring_manager.get_ring('hardware-type', '')
|
||||||
|
|
||||||
|
def test_hash_ring_manager_reset_interval(self):
|
||||||
|
CONF.set_override('hash_ring_reset_interval', 30)
|
||||||
|
# Just to simplify calculations
|
||||||
|
CONF.set_override('hash_partition_exponent', 0)
|
||||||
|
c1 = self.dbapi.register_conductor({
|
||||||
|
'hostname': 'host1',
|
||||||
|
'drivers': ['driver1', 'driver2'],
|
||||||
|
})
|
||||||
|
c2 = self.dbapi.register_conductor({
|
||||||
|
'hostname': 'host2',
|
||||||
|
'drivers': ['driver1'],
|
||||||
|
})
|
||||||
|
self.dbapi.register_conductor_hardware_interfaces(
|
||||||
|
c1.id, 'hardware-type', 'deploy', ['iscsi', 'direct'], 'iscsi')
|
||||||
|
|
||||||
|
ring = self.ring_manager.get_ring('hardware-type', '')
|
||||||
|
self.assertEqual(1, len(ring))
|
||||||
|
|
||||||
|
self.dbapi.register_conductor_hardware_interfaces(
|
||||||
|
c2.id, 'hardware-type', 'deploy', ['iscsi', 'direct'], 'iscsi')
|
||||||
|
ring = self.ring_manager.get_ring('hardware-type', '')
|
||||||
|
# The new conductor is not known yet. Automatic retry does not kick in,
|
||||||
|
# since there is an active conductor for the requested hardware type.
|
||||||
|
self.assertEqual(1, len(ring))
|
||||||
|
|
||||||
|
self.ring_manager.updated_at = time.time() - 31
|
||||||
|
ring = self.ring_manager.get_ring('hardware-type', '')
|
||||||
|
self.assertEqual(2, len(ring))
|
||||||
|
|
||||||
def test_hash_ring_manager_uncached(self):
|
def test_hash_ring_manager_uncached(self):
|
||||||
ring_mgr = hash_ring.HashRingManager(cache=False,
|
ring_mgr = hash_ring.HashRingManager(cache=False,
|
||||||
use_groups=self.use_groups)
|
use_groups=self.use_groups)
|
||||||
|
@ -130,14 +130,16 @@ class RPCAPITestCase(db_base.DbTestCase):
|
|||||||
|
|
||||||
def test_get_topic_for_driver_unknown_driver(self):
|
def test_get_topic_for_driver_unknown_driver(self):
|
||||||
CONF.set_override('host', 'fake-host')
|
CONF.set_override('host', 'fake-host')
|
||||||
self.dbapi.register_conductor({
|
c = self.dbapi.register_conductor({
|
||||||
'hostname': 'fake-host',
|
'hostname': 'fake-host',
|
||||||
'drivers': [],
|
'drivers': [],
|
||||||
})
|
})
|
||||||
|
self.dbapi.register_conductor_hardware_interfaces(
|
||||||
|
c.id, 'fake-driver', 'deploy', ['iscsi', 'direct'], 'iscsi')
|
||||||
rpcapi = conductor_rpcapi.ConductorAPI(topic='fake-topic')
|
rpcapi = conductor_rpcapi.ConductorAPI(topic='fake-topic')
|
||||||
self.assertRaises(exception.DriverNotFound,
|
self.assertRaises(exception.DriverNotFound,
|
||||||
rpcapi.get_topic_for_driver,
|
rpcapi.get_topic_for_driver,
|
||||||
'fake-driver')
|
'fake-driver-2')
|
||||||
|
|
||||||
def test_get_topic_for_driver_doesnt_cache(self):
|
def test_get_topic_for_driver_doesnt_cache(self):
|
||||||
CONF.set_override('host', 'fake-host')
|
CONF.set_override('host', 'fake-host')
|
||||||
|
13
releasenotes/notes/hash-ring-race-da0d584de1f46788.yaml
Normal file
13
releasenotes/notes/hash-ring-race-da0d584de1f46788.yaml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Fixes a race condition in the hash ring implementation that could cause
|
||||||
|
an internal server error on any request. See `story 2003966
|
||||||
|
<https://storyboard.openstack.org/#!/story/2003966>`_ for details.
|
||||||
|
upgrade:
|
||||||
|
- |
|
||||||
|
The ``hash_ring_reset_interval`` configuration option was changed from 180
|
||||||
|
to 15 seconds. Previously, this option was essentially ignored on the API
|
||||||
|
side, becase the hash ring was reset on each API access. The lower value
|
||||||
|
minimizes the probability of a request routed to a wrong conductor when the
|
||||||
|
ring needs rebalancing.
|
Loading…
Reference in New Issue
Block a user