fix(MongoDB): Driver does not retry on AutoReconnect errors

When a primary MongoDB node fails over to a secondary, pymongo raises an
AutoReconnect error. Let's catch that and retry the operation so that
we truly are Highly Available (in the sense that the user will never notice
the few ms of "downtime" caused by a failover).

This is particularly important when hosting backend with a DBaaS that
routinely fails over the master as a way of compacting shards.

NOTE: In order to get all MongoDB tests green, a tiny unrelated bug in
test_shards was fixed as part of this patch.

Closes-Bug: 1214973
Change-Id: Ibf172e30ec6e7fa0bbb8fdcebda9e985d1e49714
This commit is contained in:
kgriffs 2014-01-30 11:09:31 -06:00
parent 266ca5380a
commit c13fb6f93e
8 changed files with 136 additions and 3 deletions

View File

@ -57,6 +57,7 @@ class ClaimController(storage.Claim):
""" """
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def get(self, queue, claim_id, project=None): def get(self, queue, claim_id, project=None):
msg_ctrl = self.driver.message_controller msg_ctrl = self.driver.message_controller
@ -97,7 +98,14 @@ class ClaimController(storage.Claim):
return (claim_meta, msgs) return (claim_meta, msgs)
# NOTE(kgriffs): If we get an autoreconnect or any other connection error,
# the worst that can happen is you get an orphaned claim, but it will
# expire eventually and free up those messages to be claimed again. We
# might consider setting a "claim valid" flag similar to how posting
# messages works, in order to avoid this situation if it turns out to
# be a real problem for users.
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def create(self, queue, metadata, project=None, def create(self, queue, metadata, project=None,
limit=storage.DEFAULT_MESSAGES_PER_CLAIM): limit=storage.DEFAULT_MESSAGES_PER_CLAIM):
"""Creates a claim. """Creates a claim.
@ -190,6 +198,7 @@ class ClaimController(storage.Claim):
return (str(oid), messages) return (str(oid), messages)
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def update(self, queue, claim_id, metadata, project=None): def update(self, queue, claim_id, metadata, project=None):
cid = utils.to_oid(claim_id) cid = utils.to_oid(claim_id)
if cid is None: if cid is None:
@ -233,6 +242,7 @@ class ClaimController(storage.Claim):
upsert=False, multi=True) upsert=False, multi=True)
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def delete(self, queue, claim_id, project=None): def delete(self, queue, claim_id, project=None):
msg_ctrl = self.driver.message_controller msg_ctrl = self.driver.message_controller
msg_ctrl._unclaim(queue, claim_id, project=project) msg_ctrl._unclaim(queue, claim_id, project=project)

View File

@ -425,6 +425,7 @@ class MessageController(storage.Message):
yield str(marker_id['next']) yield str(marker_id['next'])
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def first(self, queue_name, project=None, sort=1): def first(self, queue_name, project=None, sort=1):
cursor = self._list(queue_name, project=project, cursor = self._list(queue_name, project=project,
include_claimed=True, sort=sort, include_claimed=True, sort=sort,
@ -437,6 +438,7 @@ class MessageController(storage.Message):
return message return message
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def get(self, queue_name, message_id, project=None): def get(self, queue_name, message_id, project=None):
mid = utils.to_oid(message_id) mid = utils.to_oid(message_id)
if mid is None: if mid is None:
@ -460,6 +462,7 @@ class MessageController(storage.Message):
return _basic_message(message[0], now) return _basic_message(message[0], now)
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def bulk_get(self, queue_name, message_ids, project=None): def bulk_get(self, queue_name, message_ids, project=None):
message_ids = [mid for mid in map(utils.to_oid, message_ids) if mid] message_ids = [mid for mid in map(utils.to_oid, message_ids) if mid]
if not message_ids: if not message_ids:
@ -485,7 +488,13 @@ class MessageController(storage.Message):
return utils.HookedCursor(messages, denormalizer) return utils.HookedCursor(messages, denormalizer)
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def post(self, queue_name, messages, client_uuid, project=None): def post(self, queue_name, messages, client_uuid, project=None):
# NOTE(flaper87): This method should be safe to retry on
# autoreconnect, since we've a 2-step insert for messages.
# The worst-case scenario is that we'll increase the counter
# several times and we'd end up with some non-active messages.
if not self._queue_ctrl.exists(queue_name, project): if not self._queue_ctrl.exists(queue_name, project):
raise errors.QueueDoesNotExist(queue_name, project) raise errors.QueueDoesNotExist(queue_name, project)
@ -664,6 +673,7 @@ class MessageController(storage.Message):
succeeded_ids) succeeded_ids)
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def delete(self, queue_name, message_id, project=None, claim=None): def delete(self, queue_name, message_id, project=None, claim=None):
# NOTE(cpp-cabrera): return early - this is an invalid message # NOTE(cpp-cabrera): return early - this is an invalid message
# id so we won't be able to find it any way # id so we won't be able to find it any way
@ -714,6 +724,7 @@ class MessageController(storage.Message):
collection.remove(query['_id'], w=0) collection.remove(query['_id'], w=0)
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def bulk_delete(self, queue_name, message_ids, project=None): def bulk_delete(self, queue_name, message_ids, project=None):
message_ids = [mid for mid in map(utils.to_oid, message_ids) if mid] message_ids = [mid for mid in map(utils.to_oid, message_ids) if mid]
query = { query = {

View File

@ -53,6 +53,16 @@ MONGODB_OPTIONS = [
'sleep interval, in order to decrease probability ' 'sleep interval, in order to decrease probability '
'that parallel requests will retry at the ' 'that parallel requests will retry at the '
'same instant.')), 'same instant.')),
cfg.IntOpt('max_reconnect_attempts', default=10,
help=('Maximum number of times to retry an operation that '
'failed due to a primary node failover.')),
cfg.FloatOpt('reconnect_sleep', default=0.020,
help=('Base sleep interval between attempts to reconnect '
'after a primary node failover. '
'The actual sleep time increases exponentially (power '
'of 2) each time the operation is retried.')),
] ]
MONGODB_GROUP = 'drivers:storage:mongodb' MONGODB_GROUP = 'drivers:storage:mongodb'

View File

@ -192,12 +192,23 @@ class QueueController(storage.Queue):
yield marker_name and marker_name['next'] yield marker_name and marker_name['next']
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def get_metadata(self, name, project=None): def get_metadata(self, name, project=None):
queue = self._get(name, project) queue = self._get(name, project)
return queue.get('m', {}) return queue.get('m', {})
@utils.raises_conn_error @utils.raises_conn_error
# @utils.retries_on_autoreconnect
def create(self, name, project=None): def create(self, name, project=None):
# NOTE(flaper87): If the connection fails after it was called
# and we retry to insert the queue, we could end up returning
# `False` because of the `DuplicatedKeyError` although the
# queue was indeed created by this API call.
#
# TODO(kgriffs): Commented out `retries_on_autoreconnect` for
# now due to the above issue, since creating a queue is less
# important to make super HA.
try: try:
# NOTE(kgriffs): Start counting at 1, and assume the first # NOTE(kgriffs): Start counting at 1, and assume the first
# message ever posted will succeed and set t to a UNIX # message ever posted will succeed and set t to a UNIX
@ -214,11 +225,13 @@ class QueueController(storage.Queue):
return True return True
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def exists(self, name, project=None): def exists(self, name, project=None):
query = _get_scoped_query(name, project) query = _get_scoped_query(name, project)
return self._collection.find_one(query) is not None return self._collection.find_one(query) is not None
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def set_metadata(self, name, metadata, project=None): def set_metadata(self, name, metadata, project=None):
rst = self._collection.update(_get_scoped_query(name, project), rst = self._collection.update(_get_scoped_query(name, project),
{'$set': {'m': metadata}}, {'$set': {'m': metadata}},
@ -229,11 +242,13 @@ class QueueController(storage.Queue):
raise errors.QueueDoesNotExist(name, project) raise errors.QueueDoesNotExist(name, project)
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def delete(self, name, project=None): def delete(self, name, project=None):
self.driver.message_controller._purge_queue(name, project) self.driver.message_controller._purge_queue(name, project)
self._collection.remove(_get_scoped_query(name, project)) self._collection.remove(_get_scoped_query(name, project))
@utils.raises_conn_error @utils.raises_conn_error
@utils.retries_on_autoreconnect
def stats(self, name, project=None): def stats(self, name, project=None):
if not self.exists(name, project=project): if not self.exists(name, project=project):
raise errors.QueueDoesNotExist(name, project) raise errors.QueueDoesNotExist(name, project)

View File

@ -18,12 +18,14 @@ import collections
import datetime import datetime
import functools import functools
import random import random
import time
from bson import errors as berrors from bson import errors as berrors
from bson import objectid from bson import objectid
from bson import tz_util from bson import tz_util
from pymongo import errors from pymongo import errors
from marconi.openstack.common.gettextutils import _
import marconi.openstack.common.log as logging import marconi.openstack.common.log as logging
from marconi.openstack.common import timeutils from marconi.openstack.common import timeutils
from marconi.queues.storage import errors as storage_errors from marconi.queues.storage import errors as storage_errors
@ -238,9 +240,9 @@ def get_partition(num_partitions, queue, project=None):
def raises_conn_error(func): def raises_conn_error(func):
"""Handles mongodb ConnectionFailure error """Handles the MongoDB ConnectionFailure error.
This decorator catches mongodb's ConnectionFailure This decorator catches MongoDB's ConnectionFailure
error and raises Marconi's ConnectionError instead. error and raises Marconi's ConnectionError instead.
""" """
@ -255,6 +257,46 @@ def raises_conn_error(func):
return wrapper return wrapper
def retries_on_autoreconnect(func):
"""Causes the wrapped function to be re-called on AutoReconnect.
This decorator catches MongoDB's AutoReconnect error and retries
the function call.
.. Note::
Assumes that the decorated function has defined self.driver.mongodb_conf
so that `max_reconnect_attempts` and `reconnect_sleep` can be taken
into account.
.. Warning:: The decorated function must be idempotent.
"""
@functools.wraps(func)
def wrapper(self, *args, **kwargs):
# TODO(kgriffs): Figure out a way to not have to rely on the
# presence of `mongodb_conf`
max_attemps = self.driver.mongodb_conf.max_reconnect_attempts
sleep_sec = self.driver.mongodb_conf.reconnect_sleep
for attempt in range(max_attemps):
try:
return func(self, *args, **kwargs)
break
except errors.AutoReconnect as ex:
LOG.warn(_(u'Caught AutoReconnect, retrying the '
'call to {0}').format(func))
time.sleep(sleep_sec * (2 ** attempt))
else:
LOG.error(_(u'Caught AutoReconnect, maximum attempts '
'to {0} exceeded.').format(func))
raise ex
return wrapper
class HookedCursor(object): class HookedCursor(object):
def __init__(self, cursor, denormalizer): def __init__(self, cursor, denormalizer):

View File

@ -132,7 +132,7 @@ class ShardsBaseTest(base.TestBase):
def test_put_existing_overwrites(self): def test_put_existing_overwrites(self):
# NOTE(cabrera): setUp creates default shard # NOTE(cabrera): setUp creates default shard
expect = {'weight': 20, 'uri': 'sqlalchemy://other'} expect = self.doc
self.simulate_put(self.shard, self.simulate_put(self.shard,
body=json.dumps(expect)) body=json.dumps(expect))
self.assertEqual(self.srmock.status, falcon.HTTP_201) self.assertEqual(self.srmock.status, falcon.HTTP_201)

View File

@ -12,3 +12,5 @@ port = 8888
[drivers:storage:mongodb] [drivers:storage:mongodb]
uri = mongodb://127.0.0.1:27017 uri = mongodb://127.0.0.1:27017
database = marconi_test database = marconi_test
max_reconnect_attempts = 3
reconnect_sleep = 0.001

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import collections
import datetime import datetime
import time import time
import uuid import uuid
@ -44,6 +45,20 @@ def _cleanup_databases(controller):
class MongodbUtilsTest(testing.TestBase): class MongodbUtilsTest(testing.TestBase):
config_file = 'wsgi_mongodb.conf'
def setUp(self):
super(MongodbUtilsTest, self).setUp()
self.conf.register_opts(options.MONGODB_OPTIONS,
group=options.MONGODB_GROUP)
self.mongodb_conf = self.conf[options.MONGODB_GROUP]
MockDriver = collections.namedtuple('MockDriver', 'mongodb_conf')
self.driver = MockDriver(self.mongodb_conf)
def test_scope_queue_name(self): def test_scope_queue_name(self):
self.assertEqual(utils.scope_queue_name('my-q'), '/my-q') self.assertEqual(utils.scope_queue_name('my-q'), '/my-q')
self.assertEqual(utils.scope_queue_name('my-q', None), '/my-q') self.assertEqual(utils.scope_queue_name('my-q', None), '/my-q')
@ -84,6 +99,34 @@ class MongodbUtilsTest(testing.TestBase):
self.assertRaises(ValueError, utils.calculate_backoff, 10, 10, 2, 0) self.assertRaises(ValueError, utils.calculate_backoff, 10, 10, 2, 0)
self.assertRaises(ValueError, utils.calculate_backoff, 11, 10, 2, 0) self.assertRaises(ValueError, utils.calculate_backoff, 11, 10, 2, 0)
def test_retries_on_autoreconnect(self):
num_calls = [0]
@utils.retries_on_autoreconnect
def _raises_autoreconnect(self):
num_calls[0] += 1
raise pymongo.errors.AutoReconnect()
self.assertRaises(pymongo.errors.AutoReconnect,
_raises_autoreconnect, self)
self.assertEqual(num_calls, [self.mongodb_conf.max_reconnect_attempts])
def test_retries_on_autoreconnect_neg(self):
num_calls = [0]
@utils.retries_on_autoreconnect
def _raises_autoreconnect(self):
num_calls[0] += 1
# NOTE(kgriffs): Don't exceed until the last attempt
if num_calls[0] < self.mongodb_conf.max_reconnect_attempts:
raise pymongo.errors.AutoReconnect()
# NOTE(kgriffs): Test that this does *not* raise AutoReconnect
_raises_autoreconnect(self)
self.assertEqual(num_calls, [self.mongodb_conf.max_reconnect_attempts])
@testing.requires_mongodb @testing.requires_mongodb
class MongodbDriverTest(testing.TestBase): class MongodbDriverTest(testing.TestBase):